import read_data as data
import settings

# Stream every annotated article, echo its heading structure to stdout,
# and persist the article text under settings.PATH_OUTPUT.
i = 1
articles = []

with open(settings.PATH_ARTICLES, 'rb') as a:
    for article in data.iter_annotations(a):
        i += 1
        outPath = settings.PATH_OUTPUT + article.page_name

        # print page name
        print(article.page_name, '\n')

        # print page main sections (top-level heading + child count)
        print([(section.heading, len(children))
               for (section, children) in article.deep_headings_list()], '\n')

        # print page all sections as slash-joined heading paths
        print(["/".join([section.heading for section in sectionpath])
               for sectionpath in article.flat_headings_list()], '\n')

        # save the article
        with open(outPath, 'w', encoding='utf-8') as processed_article:
            processed_article.write(article.to_string())

        # NUM_ARTICLES == -1 means "process everything"
        if settings.NUM_ARTICLES != -1 and i > settings.NUM_ARTICLES:
            break
# Fan the annotation stream out to a worker pool; `initialize_entities`
# primes each worker's process-local state before `extract` runs.
p = Pool(processes=cores,
         initializer=initialize_entities,
         initargs=(),
         maxtasksperchild=200)

with open(settings.PATH_ARTICLES, 'rb') as a:
    j = 0
    tic = time.time()
    try:
        for work in p.imap(extract, data.iter_annotations(a), chunksize=1000):
            j += 1
            print(j)
    except Exception as e:
        # BUG FIX: the original wrote `a.page_id`, but `a` is the open
        # file handle and has no `page_id` attribute, so the handler
        # itself raised AttributeError.  Record the exception instead.
        with open('STATS_ERRORS.txt', 'a') as err:
            err.write(str(e) + '\n')

print('\n')
print('JOINING NOW')
p.close()
p.join()
"train.pages.cbor" ] print("Select a number: ") while True: x = int(input("> ")) if x == 1: print(pages) print("\nWhich data would you like to see?\n") while True: y = int(input("> ")) if y == 1: for page in read_data.iter_annotations(open(pages[0], 'rb')): print(page.page_id) break if y == 2: for page in read_data.iter_annotations(open(pages[1], 'rb')): print(page.page_id) break if y == 3: for page in read_data.iter_annotations(open(pages[2], 'rb')): print(page.page_id) break else: print("That's an invalid choice, try again.")
def main():
    """Annotate articles in parallel; write corpus parts and an entity CSV.

    Command-line options:
        -c  number of worker processes          (default 2)
        -p  articles per output part file       (default 50)
        -s  index of the first article          (default 100)
        -e  index one past the last article     (default 600)
    """
    s = 0          # articles successfully annotated
    p = 0          # articles accumulated in the current part file
    i = 1          # current part-file index
    store = 1000   # flush corpus to disk every `store` annotated articles
    part = 50      # articles per part before rolling to the next index
    cores = 2
    start = 100
    end = 600

    opts, _ = getopt.getopt(sys.argv[1:], 'c:p:s:e:')
    for opt, arg in opts:
        if opt == '-c':
            cores = int(arg)
        if opt == '-p':
            part = int(arg)
        if opt == '-s':
            start = int(arg)
        if opt == '-e':
            end = int(arg)

    print('workers: ' + str(cores))
    print('articles: ' + str(start) + ' - ' + str(end))

    # Collect per-article annotation frames and concatenate once at the end:
    # repeated DataFrame.append() in a loop is quadratic and was removed in
    # pandas 2.x.
    annotation_frames = []

    jobs = None
    if os.name == 'nt':
        # Windows spawns fresh worker processes, so each one must build the
        # knowledge base itself via the pool initializer.
        jobs = Pool(processes=cores,
                    initializer=annotator.initialize_knowledgebase,
                    initargs=(), maxtasksperchild=250)
    else:
        # Fork-based platforms: build once in the parent; children inherit
        # the initialized state through fork.
        annotator.initialize_knowledgebase()
        jobs = Pool(processes=cores, initargs=(), maxtasksperchild=250)

    with open(settings.PATH_ARTICLES, 'rb') as a:
        # Accumulate article bodies in a list and ''.join() at write time;
        # repeated `corpus += ...` was quadratic.
        corpus_parts = []
        iterable = iter(data.iter_annotations(a))
        for work in jobs.imap_unordered(annotator.annotate,
                                        islice(iterable, start, end),
                                        chunksize=2000):
            if p > part:
                p = 0
                i += 1
            annotations, article_body = work
            if article_body is not None:
                s += 1
                p += 1
                annotation_frames.append(annotations)
                corpus_parts.append(article_body)
                corpus_parts.append('\n\n')
                if s % store == 0:
                    out_name = (settings.PATH_OUTPUT + str(start)
                                + ' part ' + str(i) + '.txt')
                    with open(out_name, 'a', encoding='utf-8') as b:
                        b.write(''.join(corpus_parts))
                    corpus_parts = []

        # BUG FIX: flush the tail accumulated since the last multiple of
        # `store`; the original silently discarded it.
        if corpus_parts:
            out_name = (settings.PATH_OUTPUT + str(start)
                        + ' part ' + str(i) + '.txt')
            with open(out_name, 'a', encoding='utf-8') as b:
                b.write(''.join(corpus_parts))

    print('\n')
    print('JOINING NOW')
    jobs.close()
    jobs.join()

    dictionary = pd.DataFrame(columns=['entity', 'entity_id'],
                              dtype='unicode', index=None)
    if annotation_frames:
        dictionary = pd.concat([dictionary] + annotation_frames)
    dictionary.drop_duplicates(inplace=True)
    dictionary.to_csv(settings.PATH_OUTPUT + str(start)
                      + ' entities_dictionary.csv',
                      chunksize=10000, encoding='utf-8', index=False)
    #pickle.dump(annotator.error_articles, open(settings.PATH_OUTPUT + 'error_articles.pickle', 'wb'))
    print('\n')
    print('DONE')
    print('Total number of articles: ' + str(s))