# Imports assumed by the listings in this section. Project-specific helpers and
# constants (load_text_file, normalized_step, ngram_step, get_files,
# make_build_folder, config, CONFIG_FILE, TAB, log) are provided by the
# project's own modules and are not redefined here.
import copy
import json
import os
import shutil
import sys
from collections import Counter
from os import listdir
from os.path import isfile, join

from gensim import corpora, models
from rich.markdown import Markdown
from rich.progress import Progress, BarColumn, TimeRemainingColumn


def main(args):
    """Normalize the preprocessed documents and generate their n-grams."""
    input_folder = os.path.join(args.build, 'preprocessed_documents')
    output_folder = os.path.join(args.build, 'raw_normalized_documents')

    config = None
    try:
        with open('config/config.json') as data:
            config = json.load(data)
        # Basic config validation
        if 'ngrams' not in config:
            raise Exception('Section "ngrams" missing from configuration file')
        else:
            # JSON keys are strings; convert the n-gram sizes to integers
            for k in copy.deepcopy(config['ngrams']):
                config['ngrams'][int(k)] = config['ngrams'][k]
                del config['ngrams'][k]
    except Exception as e:
        print('Cannot load configuration file. Details: {}'.format(e))
        exit(5)

    if not args.u:
        try:
            if args.f:
                shutil.rmtree(output_folder)
            os.mkdir(output_folder)
        except Exception as e:
            print(e)
    update = args.u

    files = sorted([
        os.path.join(input_folder, f) for f in listdir(input_folder)
        if isfile(join(input_folder, f))
        if '_text_without_conclusion.txt' in f
    ])
    raw_corpus = []
    corpus_id = []
    print('# Load documents')
    for i, p in enumerate(files):
        try:
            sys.stdout.write('\r - Load document {}/{}'.format(i + 1, len(files)))
            doc_id = p.split('/')[-1].split('_text_without_conclusion.txt')[0]
            raw_corpus.append(load_text_file(p))
            corpus_id.append(doc_id)
        except Exception as e:
            print(p, e)

    normalized_tokens = []
    print('\n# Compute tokens')
    try:
        for i, doc in enumerate(raw_corpus):
            filename = os.path.join(output_folder,
                                    '{}_normalized.txt'.format(corpus_id[i]))
            sys.stdout.write('\r - Normalize document {}/{}'.format(
                i + 1, len(raw_corpus)))
            if not update or not os.path.isfile(filename):
                normalized_tokens.append(
                    normalized_step(doc, force=args.f, lemmatization=True))
            else:
                with open(filename, 'r') as f:
                    normalized_tokens.extend(f.read().split())
    except Exception as e:
        print('\t -> Could not normalize the tokens. Details: {}'.format(e))
        exit(40)

    print('\n# Generate ngrams from tokens')
    all_grams = []
    doc_grammed = []
    try:
        for i, doc in enumerate(normalized_tokens):
            filename = os.path.join(output_folder,
                                    '{}_normalized.txt'.format(corpus_id[i]))
            sys.stdout.write('\r - Calculate ngrams for document {}/{}'.format(
                i + 1, len(raw_corpus)))
            if not update or not os.path.isfile(filename):
                grams = ngram_step(doc, config['ngrams'], force=args.f)
                merged = []
                for g in grams.values():
                    merged.extend(g)
                doc_grammed.append(merged)
                all_grams.extend(merged)
            else:
                print('\t -> Load document as already normalized.')
                with open(filename, 'r') as f:
                    all_grams.extend(f.read().split())
                    doc_grammed.append(None)
    except Exception as e:
        print(e)
    print('')

    f = Counter(all_grams)
    print('# Save the full dictionary')
    with open(os.path.join(output_folder, 'full_dictionary.txt'), 'w') as outfile:
        json.dump(f, outfile, indent=4, sort_keys=True)

    print('# Save normalized documents')
    for i, doc in enumerate(doc_grammed):
        if doc is not None:
            sys.stdout.write('\r - Save document {}/{}: {}'.format(
                i + 1, len(doc_grammed), corpus_id[i]))
            with open(
                    os.path.join(output_folder,
                                 '{}_normalized.txt'.format(corpus_id[i])),
                    'a') as file:
                file.write(' '.join(doc))
    print('')
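# A Counter serializes through json.dump as a plain {token: count} object, so the
# full_dictionary.txt written above can be reloaded and queried as a Counter.
# Minimal sketch; the build path below is a placeholder, not a project constant.
import json
from collections import Counter

with open('./build/raw_normalized_documents/full_dictionary.txt') as f:
    frequencies = Counter(json.load(f))

print(frequencies.most_common(10))  # ten most frequent tokens / n-grams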
def main(args):
    """Build the dictionary, Bag-of-Words and TF-IDF representations."""
    input_file = os.path.join(
        args.build,
        'cases_info/raw_cases_info.json' if args.processed_folder == 'all'
        else 'cases_info/raw_cases_info_{}.json'.format(args.processed_folder))
    input_folder = os.path.join(args.build, 'raw_normalized_documents')
    output_folder = os.path.join(args.build, 'processed_documents',
                                 args.processed_folder)

    print('# Read configuration')
    config = None
    try:
        with open(CONFIG_FILE) as data:
            config = json.load(data)
        # Basic config validation
        if 'ngrams' not in config:
            raise Exception('Section "ngrams" missing from configuration file')
        else:
            # JSON keys are strings; convert the n-gram sizes to integers
            for k in copy.deepcopy(config['ngrams']):
                config['ngrams'][int(k)] = config['ngrams'][k]
                del config['ngrams'][k]
    except Exception as e:
        print('Cannot load configuration file. Details: {}'.format(e))
        exit(5)

    cases_index = {}
    with open(input_file, 'r') as f:
        content = f.read()
        cases = json.loads(content)
        cases_index = {c['itemid']: i for i, c in enumerate(cases)}

    if not args.u:
        try:
            if args.f:
                shutil.rmtree(output_folder)
        except Exception as e:
            print(e)
        try:
            os.makedirs(output_folder)
        except Exception as e:
            print(e)
    update = args.u

    files = [os.path.join(input_folder, f) for f in listdir(input_folder)
             if isfile(join(input_folder, f)) if '_normalized.txt' in f
             and f.split('/')[-1].split('_normalized.txt')[0] in cases_index.keys()]
    raw_corpus = []
    corpus_id = []
    print('# Load documents')
    for i, p in enumerate(files):
        try:
            sys.stdout.write('\r - Load document {}/{}'.format(i + 1, len(files)))
            doc_id = p.split('/')[-1].split('_normalized.txt')[0]
            raw_corpus.append(load_text_file(p).split())
            corpus_id.append(doc_id)
        except Exception as e:
            print(p, e)
    print('')

    # Build the raw dictionary and keep only the most frequent tokens
    f = [t for doc in raw_corpus for t in doc]
    f = Counter(f)
    f = f.most_common(args.limit_tokens)
    words = [w[0] for w in f]

    print('# Create dictionary')
    dictionary = corpora.Dictionary([words])
    dictionary.save(os.path.join(output_folder, 'dictionary.dict'))
    with open(os.path.join(output_folder, 'feature_to_id.dict'), 'w') as outfile:
        json.dump(dictionary.token2id, outfile, indent=4, sort_keys=True)

    corpus = [dictionary.doc2bow(text) for text in raw_corpus]
    print('# Create Bag of Words')
    for i, doc in enumerate(corpus):
        filename = os.path.join(output_folder, '{}_bow.txt'.format(corpus_id[i]))
        with open(filename, 'w') as file:
            for f, v in doc:
                file.write('{}:{} '.format(f, v))

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    print('# Create TFIDF')
    for i, doc in enumerate(corpus_tfidf):
        with open(
                os.path.join(output_folder,
                             '{}_tfidf.txt'.format(corpus_id[i])),
                'w') as file:
            for f, v in doc:
                file.write('{}:{} '.format(f, v))
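# The BoW and TF-IDF files written above use a simple sparse text format:
# one "feature_id:value" pair per feature, separated by spaces. A minimal
# reload sketch (the example path and item id are placeholders):
def read_sparse_vector(path):
    """Read a *_bow.txt or *_tfidf.txt file back into (feature_id, value) pairs."""
    with open(path) as f:
        pairs = [p.split(':') for p in f.read().split()]
    return [(int(feature), float(value)) for feature, value in pairs]

# e.g. read_sparse_vector('./build/processed_documents/all/001-12345_bow.txt')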
def run(console, build, title, limit_tokens, doc_ids=None,
        processed_folder='all', force=False, update=False):
    """Refactored step: build the dictionary, BoW and TF-IDF representations."""
    __console = console
    global print
    print = __console.print  # route print() through the rich console

    input_file = os.path.join(
        build, 'raw', 'cases_info',
        'raw_cases_info_{}.json'.format(processed_folder))
    input_folder = os.path.join(build, 'raw', 'normalized_documents')
    output_folder = os.path.join(build, 'structured')
    output_folder_tfidf = os.path.join(output_folder, 'tfidf')
    output_folder_bow = os.path.join(output_folder, 'bow')

    print(Markdown("- **Step configuration**"))
    print(TAB + '> Step folder: {}'.format(output_folder_tfidf))
    make_build_folder(console, output_folder_tfidf, force, strict=False)
    print(TAB + '> Step folder: {}'.format(output_folder_bow))
    make_build_folder(console, output_folder_bow, force, strict=False)

    try:
        # Only check that the n-grams section is present in the configuration
        config()['steps']['normalize']['ngrams']
    except Exception as e:
        print('Cannot retrieve n-grams configuration. Details: {}'.format(e))
        exit(5)
    print(TAB + '> Read configuration [green][DONE]')

    cases_index = {}
    with open(input_file, 'r') as f:
        content = f.read()
        cases = json.loads(content)
        cases_index = {c['itemid']: i for i, c in enumerate(cases)}

    files = get_files(doc_ids, input_folder, cases_index)
    raw_corpus = []
    corpus_id = []
    print(Markdown('- **Create dictionary**'))
    with Progress(
            TAB + "> Loading in memory... [IN PROGRESS]",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console) as progress:
        task = progress.add_task(
            "Loading...",
            total=len(files),
            error="",
            doc=files[0].split('/')[-1].split('_normalized.txt')[0])
        for i, p in enumerate(files):
            error = ""
            try:
                doc_id = p.split('/')[-1].split('_normalized.txt')[0]
                raw_corpus.append(load_text_file(p).split())
                corpus_id.append(doc_id)
            except Exception as e:
                error = '\n| {}'.format('Could not load the document')
                log.debug(p, e)
            progress.update(task, advance=1, error=error, doc=doc_id)
    print(TAB + "> Loading in memory... [green][DONE]")

    # Build the raw dictionary and keep only the most frequent tokens
    f = [t for doc in raw_corpus for t in doc]
    f = Counter(f)
    f = f.most_common(int(limit_tokens))
    words = [w[0] for w in f]

    print(TAB + '> Create dictionary')
    dictionary = corpora.Dictionary([words])
    dictionary.save(os.path.join(output_folder, 'dictionary.dict'))
    with open(os.path.join(output_folder, 'feature_to_id.dict'), 'w') as outfile:
        json.dump(dictionary.token2id, outfile, indent=4, sort_keys=True)

    corpus = [dictionary.doc2bow(text) for text in raw_corpus]
    print(Markdown('- **Create language models**'))
    with Progress(
            TAB + "> Create Bag of Words... [IN PROGRESS]",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console) as progress:
        task = progress.add_task("Loading...",
                                 total=len(corpus),
                                 error="",
                                 doc=corpus_id[0])
        for i, doc in enumerate(corpus):
            error = ""
            filename = os.path.join(output_folder_bow,
                                    '{}_bow.txt'.format(corpus_id[i]))
            with open(filename, 'w') as file:
                for f, v in doc:
                    file.write('{}:{} '.format(f, v))
            progress.update(task, advance=1, error=error, doc=corpus_id[i])
    print(TAB + "> Create Bag of Words... [green][DONE]")

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    with Progress(
            TAB + "> Create TF-IDF... [IN PROGRESS]",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console) as progress:
        task = progress.add_task("Loading...",
                                 total=len(corpus_tfidf),
                                 error="",
                                 doc=corpus_id[0])
        for i, doc in enumerate(corpus_tfidf):
            error = ""
            with open(
                    os.path.join(output_folder_tfidf,
                                 '{}_tfidf.txt'.format(corpus_id[i])),
                    'w') as file:
                for f, v in doc:
                    file.write('{}:{} '.format(f, v))
            progress.update(task, advance=1, error=error, doc=corpus_id[i])
    print(TAB + "> Create TF-IDF... [green][DONE]")
def run(console, build, title, doc_ids=None, force=False, update=False):
    """Refactored step: normalize documents and generate their n-grams."""
    __console = console
    global print
    print = __console.print  # route print() through the rich console

    print(Markdown("- **Step configuration**"))
    input_folder = os.path.join(build, 'raw', 'preprocessed_documents')
    output_folder = os.path.join(build, 'raw', 'normalized_documents')
    ngrams_config = {}
    try:
        ngrams_config = config()['steps']['normalize']['ngrams']
    except Exception as e:
        print('Cannot retrieve n-grams configuration. Details: {}'.format(e))
        exit(5)
    print(TAB + '> Step folder: {}'.format(output_folder))
    make_build_folder(console, output_folder, force, strict=False)

    files = get_files(doc_ids, input_folder)
    raw_corpus = []
    corpus_id = []
    print(Markdown('- **Load documents**'))
    with Progress(
            TAB + "> Loading in memory... [IN PROGRESS]",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console
    ) as progress:
        task = progress.add_task(
            "Loading...",
            total=len(files),
            error="",
            doc=files[0].split('/')[-1].split('_text_without_conclusion.txt')[0])
        for i, p in enumerate(files):
            error = ""
            doc_id = p.split('/')[-1].split('_text_without_conclusion.txt')[0]
            try:
                raw_corpus.append(load_text_file(p))
                corpus_id.append(doc_id)
            except Exception as e:
                error = '\n| {}'.format('Could not load the document')
                log.debug(p, e)
            progress.update(task, advance=1, error=error, doc=doc_id)
    print(TAB + "> Loading in memory... [green][DONE]")

    normalized_tokens = []
    print(Markdown('- **Generate language model**'))
    try:
        with Progress(
                TAB + "> Normalize... [IN PROGRESS]\n",
                BarColumn(30),
                TimeRemainingColumn(),
                "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
                "{task.fields[error]}",
                transient=True,
                console=console
        ) as progress:
            task = progress.add_task("Compute tokens...",
                                     total=len(raw_corpus),
                                     error="",
                                     doc=corpus_id[0])
            for i, doc in enumerate(raw_corpus):
                error = ""
                filename = os.path.join(output_folder,
                                        '{}_normalized.txt'.format(corpus_id[i]))
                if not update or not os.path.isfile(filename):
                    normalized_tokens.append(
                        normalized_step(doc, force=force, lemmatization=True))
                else:
                    with open(filename, 'r') as f:
                        normalized_tokens.extend(f.read().split())
                progress.update(task, advance=1, error=error, doc=corpus_id[i])
    except Exception as e:
        print(TAB + '[bold red]:double_exclamation_mark: Could not normalize the tokens. Details: {}'.format(e))
        exit(40)
    print(TAB + "> Normalize... [green][DONE]")

    all_grams = []
    doc_grammed = []
    try:
        with Progress(
                TAB + "> Compute ngrams... [IN PROGRESS]\n",
                BarColumn(30),
                TimeRemainingColumn(),
                "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
                "{task.fields[error]}",
                transient=True,
                console=console
        ) as progress:
            task = progress.add_task("Compute tokens...",
                                     total=len(corpus_id),
                                     error="",
                                     doc=corpus_id[0])
            for i, doc in enumerate(normalized_tokens):
                error = ""
                filename = os.path.join(output_folder,
                                        '{}_normalized.txt'.format(corpus_id[i]))
                if not update or not os.path.isfile(filename):
                    grams = ngram_step(doc, ngrams_config, force=force)
                    merged = []
                    for g in grams.values():
                        merged.extend(g)
                    doc_grammed.append(merged)
                    all_grams.extend(merged)
                else:
                    error = "\n| Load document as already normalized."
                    with open(filename, 'r') as f:
                        all_grams.extend(f.read().split())
                        doc_grammed.append(None)
                progress.update(task, advance=1, error=error, doc=corpus_id[i])
    except Exception:
        console.print_exception()
    print(TAB + "> Compute ngrams... [green][DONE]")

    f = Counter(all_grams)
    with open(os.path.join(output_folder, 'full_dictionary.txt'), 'w') as outfile:
        json.dump(f, outfile, indent=4, sort_keys=True)
    print(TAB + '> Save the full dictionary [green][DONE]')

    with Progress(
            TAB + "> Save normalized documents... [IN PROGRESS]\n",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console
    ) as progress:
        task = progress.add_task("Compute tokens...",
                                 total=len(doc_grammed),
                                 error="",
                                 doc=corpus_id[0])
        for i, doc in enumerate(doc_grammed):
            error = ""
            if doc is not None:
                with open(os.path.join(output_folder,
                                       '{}_normalized.txt'.format(corpus_id[i])),
                          'a') as file:
                    file.write(' '.join(doc))
            progress.update(task, advance=1, error=error, doc=corpus_id[i])
    print(TAB + '> Save normalized documents... [green][DONE]')