Example no. 1
import read_data as data
import settings

# Counter mirroring the loop below: after processing article k it holds k + 1.
i = 1

# Collected articles (currently unused; kept so later code can populate it).
articles = []

# Stream the annotation dump and mirror every article to its own output file.
with open(settings.PATH_ARTICLES, 'rb') as a:
    for i, article in enumerate(data.iter_annotations(a), start=2):
        outPath = settings.PATH_OUTPUT + article.page_name

        # print page name
        print(article.page_name, '\n')
        # print page main sections
        print([(sec.heading, len(kids)) for sec, kids in article.deep_headings_list()], '\n')
        # print page all sections
        print(["/".join(sec.heading for sec in path) for path in article.flat_headings_list()], '\n')

        # save the article
        with open(outPath, 'w', encoding='utf-8') as processed_article:
            processed_article.write(article.to_string())

        # Stop once the configured budget is reached (-1 means "no limit").
        if settings.NUM_ARTICLES != -1 and i > settings.NUM_ARTICLES:
            break
Example no. 2
    # Worker pool; each worker loads its entity state via the initializer.
    p = Pool(processes=cores, initializer=initialize_entities, initargs=(), maxtasksperchild=200)

    with open(settings.PATH_ARTICLES, 'rb') as a:

        j = 0          # number of articles processed so far
        tic = time.time()  # start timestamp (no matching toc in this fragment)

        try:

            # Fan the articles out to the pool; imap yields results in input order.
            for work in p.imap(extract, data.iter_annotations(a), chunksize=1000):

                j += 1
                print(j)

        except Exception as e:

            # FIX: the original logged `a.page_id`, but `a` is the open file
            # object and has no `page_id` attribute, so the handler itself
            # crashed with AttributeError. Record the actual exception instead.
            with open('STATS_ERRORS.txt', 'a') as err:
                err.write(repr(e) + '\n')

    print('\n')
    print('JOINING NOW')

    p.close()
    p.join()
    "train.pages.cbor"
]

# Simple interactive browser over the cbor dumps listed in `pages`.
# NOTE(review): only option 1 is handled in the outer loop; any other x
# silently re-prompts forever, and a non-numeric answer raises ValueError
# from int() — preserved as-is, confirm intent.
print("Select a number: ")

while True:
    x = int(input("> "))

    if x == 1:
        print(pages)
        print("\nWhich data would you like to see?\n")

        while True:
            y = int(input("> "))

            if y in (1, 2, 3):
                # FIX: open the dump in a context manager so the file handle
                # is closed (the original leaked it), and collapse the three
                # copy-pasted branches into a single indexed lookup.
                with open(pages[y - 1], 'rb') as dump:
                    for page in read_data.iter_annotations(dump):
                        print(page.page_id)
                break

            print("That's an invalid choice, try again.")
def main() -> None:
    """Annotate articles from settings.PATH_ARTICLES in parallel, writing the
    annotated corpus to part files and an entity dictionary CSV under
    settings.PATH_OUTPUT.

    Command-line flags (via getopt):
        -c  number of worker processes (default 2)
        -p  articles per output part file (default 50)
        -s  start index into the article stream (default 100)
        -e  end index (exclusive) into the article stream (default 600)
    """

    s = 0  # count of articles that produced a non-None body
    p = 0  # articles written into the current part file
    i = 1  # current part-file number
    store = 1000  #500  (flush corpus to disk every `store` successes)
    part = 50  #100000  (articles per part file)
    cores = 2

    # 1 - 500000 + loop
    start = 100
    end = 600  #0

    # CLI overrides; getopt.GetoptError propagates on unknown flags.
    opts, args = getopt.getopt(sys.argv[1:], 'c:p:s:e:')
    for opt, arg in opts:
        if opt == '-c':
            cores = int(arg)
        if opt == '-p':
            part = int(arg)
        if opt == '-s':
            start = int(arg)
        if opt == '-e':
            end = int(arg)

    print('workers: ' + str(cores))
    print('articles: ' + str(start) + ' - ' + str(end))

    # Accumulates the (entity, entity_id) rows returned by the workers.
    # NOTE(review): DataFrame.append was removed in pandas 2.0 — as written
    # this function requires pandas < 2.0; confirm the pinned version.
    dictionary = pd.DataFrame(columns=['entity', 'entity_id'],
                              dtype='unicode',
                              index=None)
    jobs = None

    if os.name == 'nt':

        # Windows spawns fresh interpreters, so each worker must load the
        # knowledge base itself via the pool initializer.
        jobs = Pool(processes=cores,
                    initializer=annotator.initialize_knowledgebase,
                    initargs=(),
                    maxtasksperchild=250)

    else:

        # Fork-based platforms: load once in the parent; children inherit it.
        annotator.initialize_knowledgebase()
        jobs = Pool(processes=cores, initargs=(), maxtasksperchild=250)

    with open(settings.PATH_ARTICLES, 'rb') as a:

        corpus = ''  # buffered article bodies awaiting flush
        iterable = iter(data.iter_annotations(a))
        # Only articles [start, end) are fed to the pool; results arrive in
        # arbitrary order (imap_unordered).
        for work in jobs.imap_unordered(annotator.annotate,
                                        islice(iterable, start, end),
                                        chunksize=2000):

            # Roll over to the next part file once the current one is full.
            if p > part:
                p = 0
                i += 1

            annotations, article_body = work
            if article_body is not None:
                s += 1
                p += 1

                dictionary = dictionary.append(annotations)
                corpus += article_body
                corpus += '\n\n'

            # Flush the buffered corpus every `store` successful articles.
            # NOTE(review): this also fires while s == 0 (appending an empty
            # string), and any corpus remaining after the loop is never
            # flushed — up to store-1 articles can be lost. Confirm intent.
            if s % store == 0:
                with open(settings.PATH_OUTPUT + str(start) + ' part ' +
                          str(i) + '.txt',
                          'a',
                          encoding='utf-8') as b:
                    b.write(corpus)
                    corpus = ''

    print('\n')
    print('JOINING NOW')

    jobs.close()
    jobs.join()

    # De-duplicate and persist the entity dictionary for this run.
    dictionary.drop_duplicates(inplace=True)
    dictionary.to_csv(settings.PATH_OUTPUT + str(start) +
                      ' entities_dictionary.csv',
                      chunksize=10000,
                      encoding='utf-8',
                      index=False)

    #pickle.dump(annotator.error_articles, open(settings.PATH_OUTPUT + 'error_articles.pickle', 'wb'))

    print('\n')
    print('DONE')

    print('Total number of articles: ' + str(s))