Example #1
# NOTE: metadata, preprocessing and the Wikifier class are project-local
# modules; the import paths below are assumptions about the repository layout.
import metadata
import preprocessing
from wikification import Wikifier


def main():
    #### PREPROCESSING ###################################
    # obtain a dict with the metadata on each text:
    meta = metadata.metadata_dict()
    # convert the articles in the original xml files to plain text:
    preprocessing.parse_secondary_dbnl(max_documents=60000)
    # run Frog (Dutch NLP tagger/lemmatizer) over the plain-text articles (call currently disabled):
    # preprocessing.frog_articles()
    """
    #### WIKIFICATION ###################################
    # construct the wikifier:
    wikifier = Wikifier()
    # collect relevant page_ids for your categories
    wikifier.relevant_page_ids(fresh=False)
    page_ids = wikifier.page_ids[:100]
    # collect ids of pages that backlink to your relevant pages:
    wikifier.backlinking_pages(page_ids=page_ids, fresh=False)
    # collect all mentions of the target pages in the backlinks (this will take a while!)
    wikifier.mentions_from_backlinks(fresh=False)
    # turn the collected mentions into a matrix (vectorization)
    input_dim, output_dim = wikifier.vectorize_wiki_mentions(fresh=False, max_features=500)
    # optimize a classifier to label new mentions:
    #dev_acc, test_acc = wikifier.classifier(input_dim=input_dim, output_dim=output_dim, fresh=True, test=True, nb_epochs=2)
    # train the final classifier on all data:
    wikifier.classifier(input_dim=input_dim, output_dim=output_dim, fresh=False, test=False, nb_epochs=1)
    ######## (the following is specific to the dbnl data) #######################################################
    # collect all unique NEs in the corpus
    # and get the pages which the wikipedia search interface links them to:
    wikifier.extract_unique_nes(fresh=False, max_documents=100, max_words_per_doc=150)
    # use the trained wikifier to disambiguate the NEs in the corpus:
    wikifier.disambiguate_nes(max_documents=100, max_words_per_doc=150)
    """
    """
Example #2
# Standard-library and third-party imports required below; `metadata` is a
# project-local module and `xml_to_articles` is assumed to be defined
# elsewhere in this module.
import codecs
import glob
import os
import shutil
from collections import Counter

from bokeh.plotting import figure, output_file, save

import metadata


def parse_secondary_dbnl(max_documents=100):
    """
    Parses all xml-files under the ../texts directory.
    Only considers files with:
        - genre = 'sec - letterkunde'
        - subgenre = 'tijdschrift / jaarboek'
        - 1945 > date < 2002
    Additionally, only Dutch-language articles will be included.
    Only outputs articles which are recognized as 'nl'
    All individual 'chapters' (i.e. articles) are saved separately in ../workspace/periodicals
    """

    year_counts = Counter()

    # get metadata
    metadata_dict = metadata.metadata_dict()
    
    # keep track:
    document_cnt = 0 # nb of documents (i.e. 'journal issues')
    article_cnt = 0 # nb of chapters (i.e. 'articles/reviews')

    # initialize directories:
    if not os.path.isdir('../workspace'):
        os.mkdir('../workspace')
    if not os.path.isdir('../figures'):
        os.mkdir('../figures')
    if os.path.isdir('../workspace/periodicals'):
        shutil.rmtree('../workspace/periodicals')
    os.mkdir('../workspace/periodicals')

    # iterate over the full texts which we have:
    for filepath in glob.glob('../texts/*.xml'):

        text_id = os.path.splitext(os.path.basename(filepath))[0][:-3] # remove trailing "_01"

        # see whether we have all the necessary metadata for the text:
        try:
            title = metadata_dict[text_id]['title']
            date = metadata_dict[text_id]['year']
            genre = metadata_dict[text_id]['genre']
            subgenre = metadata_dict[text_id]['subgenre']
        except KeyError:
            continue

        # limited to post-war studies on literature in periodicals:
        if genre == 'sec - letterkunde' and \
            subgenre == 'tijdschrift / jaarboek' and \
            date != "???" and 1945 < date < 2002:

            print(">>>", title)

            # collect the individual articles in the issue:
            articles = xml_to_articles(filepath)
            if articles:
                for idx, article in enumerate(articles):
                    new_filepath = '../workspace/periodicals/'
                    new_filepath += text_id+"-"+str(idx+1)+'-'+str(date)+'.txt'
                    with codecs.open(new_filepath, 'w', 'utf-8') as f:
                        f.write(article)

                    # update stats:
                    article_cnt += 1
                    year_counts[date] += 1

            # update cnts:
            document_cnt += 1

        if document_cnt >= max_documents:
            break

    print('nb issues parsed:', document_cnt)
    print('nb individual articles extracted:', article_cnt)

    # visualize distribution over time:
    cnts = sorted(year_counts.items())
    output_file('../figures/nb_articles_yearly.html')
    p = figure(plot_width=1200, plot_height=400, x_axis_label='year', y_axis_label='nb articles')
    p.line([y for y,_ in cnts], [c for _,c in cnts], line_width=2)
    save(p)
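parse_secondary_dbnl() writes each article to ../workspace/periodicals using the filename pattern <text_id>-<index>-<year>.txt. A minimal sketch (not part of the original code; the helper name load_articles is hypothetical) of how those files can be read back:

import codecs
import glob
import os


def load_articles(workspace='../workspace/periodicals'):
    # Yield (text_id, article_index, year, text) for every saved article,
    # recovering the fields from the <text_id>-<index>-<year>.txt filenames.
    for path in sorted(glob.glob(os.path.join(workspace, '*.txt'))):
        stem = os.path.splitext(os.path.basename(path))[0]
        text_id, idx, year = stem.rsplit('-', 2)
        with codecs.open(path, 'r', 'utf-8') as f:
            yield text_id, int(idx), int(year), f.read()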
Example #3
# (requires the same imports as Example #2)
def parse_secondary_dbnl(max_documents=100):
    """
    Parses all xml-files under the ../texts directory.
    Only considers files with:
        - genre = 'sec - letterkunde'
        - subgenre = 'tijdschrift / jaarboek'
        - date > 1945
    Additionally, only Dutch-language articles will be included.
    Only outputs articles which are recognized as 'nl'
    All individual 'chapters' (i.e. articles) are saved separately in ../workspace/periodicals
    """

    year_counts = Counter()

    # get metadata
    metadata_dict = metadata.metadata_dict()

    # keep track:
    document_cnt = 0  # nb of documents (i.e. 'journal issues')
    article_cnt = 0  # nb of chapters (i.e. 'articles/reviews')

    # initialize directories:
    if not os.path.isdir('../workspace'):
        os.mkdir('../workspace')
    if not os.path.isdir('../figures'):
        os.mkdir('../figures')
    if os.path.isdir('../workspace/periodicals_tmp'):
        shutil.rmtree('../workspace/periodicals_tmp')
    os.mkdir('../workspace/periodicals_tmp')

    # iterate over the full texts which we have:
    for filepath in glob.glob('../texts/*.xml'):

        text_id = os.path.splitext(
            os.path.basename(filepath))[0][:-3]  # remove trailing "_01"

        # see whether we have all the necessary metadata for the text:
        try:
            title = metadata_dict[text_id]['title']
            date = metadata_dict[text_id]['year']
            genre = metadata_dict[text_id]['genre']
            subgenre = metadata_dict[text_id]['subgenre']
        except KeyError:
            continue

        # limited to studies on literature in periodicals published after 2001:
        if genre == 'sec - letterkunde' and \
            subgenre == 'tijdschrift / jaarboek' and \
            date != "???" and date > 2001:

            print(">>>", title)

            # collect the individual articles in the issue:
            articles = xml_to_articles(filepath)
            if articles:
                for idx, article in enumerate(articles):
                    new_filepath = '../workspace/periodicals_tmp/'
                    new_filepath += text_id + "-" + str(idx + 1) + '-' + str(
                        date) + '.txt'
                    with codecs.open(new_filepath, 'w', 'utf-8') as f:
                        f.write(article)

                    # update stats:
                    article_cnt += 1
                    year_counts[date] += 1

            # update cnts:
            document_cnt += 1

        if document_cnt >= max_documents:
            break

    print('nb issues parsed:', document_cnt)
    print('nb individual articles extracted:', article_cnt)

    # visualize distribution over time:
    cnts = sorted(year_counts.items())
    output_file('../figures/nb_articles_yearly.html')
    p = figure(plot_width=1200,
               plot_height=400,
               x_axis_label='year',
               y_axis_label='nb articles')
    p.line([y for y, _ in cnts], [c for _, c in cnts], line_width=2)
    save(p)
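Example #3 is the same function with a different date filter (articles after 2001 instead of 1946-2001) and a separate output directory, ../workspace/periodicals_tmp. If the two outputs are meant to end up in a single corpus directory, a merge along the following lines would do it (this step is an assumption, not shown in the original):

import glob
import os
import shutil

# Move the post-2001 articles from Example #3 into the directory used by Example #2.
for path in glob.glob('../workspace/periodicals_tmp/*.txt'):
    shutil.move(path, os.path.join('../workspace/periodicals', os.path.basename(path)))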