def ComputeDF(pathToCollectionOfDocs, lang, normalization, pathToDFFile):
    """Compute Document Frequency (DF) counts from a collection of documents.

    N-grams up to 3-grams are extracted and converted to their n-stems forms.
    Those containing a token that occurs in a stoplist are filtered out.
    Output file is in compressed (gzip) tab-separated-values format (tsv.gz).
    """
    # path to the collection of documents
    print(f"DF will be computed on top of the following collection of docs: {pathToCollectionOfDocs}")

    if os.path.exists(pathToDFFile):
        print(f"DF model already exists here: {pathToDFFile}")
    else:
        print(f"DF model doesn't exist. It will be created (and may take a while) and will be saved here: {pathToDFFile}")

        # stoplist for filtering n-grams: punctuation, CoreNLP bracket tokens
        # and language-specific stopwords
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += load_stop_words(lang)

        compute_document_frequency(pathToCollectionOfDocs,
                                   pathToDFFile,
                                   extension='txt',
                                   language=lang,
                                   normalization=normalization,
                                   stoplist=stoplist)
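# A minimal usage sketch for the helper above (assumes the imports it relies
# on -- os, string, pke, load_stop_words -- are in place). The corpus folder
# and output path below are hypothetical placeholders; load_document_frequency_file
# is the pke helper that reads the resulting tsv.gz back into a dict of
# n-stem -> document-frequency counts.
import pke

ComputeDF(pathToCollectionOfDocs='data/corpus/',      # hypothetical folder of .txt files
          lang='en',
          normalization='stemming',
          pathToDFFile='models/df_counts.tsv.gz')     # hypothetical output path

df_counts = pke.load_document_frequency_file(input_file='models/df_counts.tsv.gz')
print(f"{len(df_counts)} n-stems with DF counts")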
def compute_document_frequency(input_file):
    # stopwords = list(punctuation)
    pke.compute_document_frequency(
        input_dir=input_file,
        output_file='trial_doc_freq.tsv.gz',
        extension='txt',             # input file extension
        language='en',               # language of files
        normalization="stemming")    # use porter stemmer
def main():
    # process the document frequency of the reference corpus
    """Compute Document Frequency (DF) counts from a collection of documents.

    N-grams up to 3-grams are extracted and converted to their n-stems forms.
    Those containing a token that occurs in a stoplist are filtered out.
    Output file is in compressed (gzip) tab-separated-values format (tsv.gz).
    """
    # stoplist for filtering n-grams
    stoplist = list(punctuation)

    # compute df counts and store as n-stem -> weight values
    compute_document_frequency(
        input_dir='/Users/gmt28/Documents/Workspace/Docker_Engine/varad/Yale_Projects/shoah-foundation-data-restored/shoah-foundation-data/data/inputs/fortunoff/transcripts/',
        output_file='/Users/gmt28/Documents/Workspace/data_analysis_lts/Processes/Extract_Keywords/output.tsv.gz',
        extension='txt',         # input file extension
        language='en',           # language of files
        normalization=None,      # no stemming / normalization
        stoplist=stoplist,
        n=1)
    pdb.set_trace()

    """Keyphrase extraction using TfIdf and newly computed DF counts."""
    # initialize TfIdf model
    extractor = pke.unsupervised.TfIdf()

    # load the DF counts from file
    df_counts = pke.load_document_frequency_file(
        input_file='/Users/gmt28/Documents/Workspace/data_analysis_lts/Processes/Extract_Keywords/output.tsv.gz')

    # load the content of the document
    extractor.load_document(
        input='/Users/gmt28/Documents/Workspace/data_analysis_lts/Processes/Extract_Keywords/text.txt',
        normalization=None,
        language='en')

    # keyphrase candidate selection
    extractor.candidate_selection(n=1, stoplist=list(string.punctuation))

    # candidate weighting with the provided DF counts
    extractor.candidate_weighting(df=df_counts)

    # N-best selection; keyphrases contains the 15 highest scored candidates
    # as (keyphrase, score) tuples
    keyphrases = extractor.get_n_best(n=15)
    print(keyphrases)
    pdb.set_trace()
def try_export_jsonl():
    n = 10
    snlp_folder = "../data/processed/news/relevant/train/"
    compute_document_frequency(
        snlp_folder,
        os.path.join("../data/interim/news_cargo_df.tsv.gz"),
        stoplist=list(STOP_WORDS))
    cargo_df = load_document_frequency_file(
        "../data/interim/news_cargo_df.tsv.gz")
    pke_factory = {
        "grammar": r"""
            NBAR:
                {<NOUN|PROPN|NUM|ADJ>*<NOUN|PROPN>}
            NP:
                {<NBAR>}
                {<NBAR><ADP><NBAR>}
        """,
        "filtering_params": {
            "stoplist": list(STOP_WORDS)
        },
        "extractors": {
            "kpm": {
                "instance": PKEBasedTermsExtractor(KPMiner),
                "weighting_params": {
                    "df": cargo_df
                }
            },
        }
    }
    for name in pke_factory["extractors"]:
        log.info(f"Begin Extraction with PKE based extractor: {name}")
        extractor_instance = pke_factory["extractors"][name]["instance"]
        if "filtering_params" in pke_factory["extractors"][name]:
            filtering_params = {
                **pke_factory["filtering_params"],
                **pke_factory["extractors"][name]["filtering_params"]
            }
        else:
            filtering_params = pke_factory["filtering_params"]
        extractor_instance.extract(
            snlp_folder,
            n,
            grammar=pke_factory["grammar"],
            filtering_params=filtering_params,
            weighting_params=pke_factory["extractors"][name]["weighting_params"],
            output_file=f"../results/extracted_terms/train/{name}.csv",
            auto_term_file=f"../data/annotations/automatic/terms/{name}.jsonl")
def train_word_frequency():
    # stoplist for filtering n-grams
    stoplist = list(string.punctuation)
    # stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    # stoplist += stopwords.words('english')

    # compute df counts and store as n-stem -> weight values
    pke.compute_document_frequency(
        input_dir='../scratch/lda_text',
        output_file='../scratch/tf_abs_2.tsv.gz',
        extension='txt',             # input file extension
        language='en',               # language of files
        normalization="stemming",    # use porter stemmer
        stoplist=stoplist)
# -*- coding: utf-8 -*-

import logging
import sys
from string import punctuation

from pke import compute_document_frequency

# setting info in terminal
logging.basicConfig(level=logging.INFO)

# path to the collection of documents
input_dir = sys.argv[1]

# path to the df weights dictionary, saved as a gzipped csv file
output_file = sys.argv[2]

# stoplist: punctuation marks and CoreNLP bracket tokens
stoplist = list(punctuation)
stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']

# compute document frequencies
compute_document_frequency(
    input_dir=input_dir,
    output_file=output_file,
    extension='xml',             # input file extension
    language='en',               # language of the input files
    normalization="stemming",    # use porter stemmer
    stoplist=stoplist,           # stoplist
    delimiter='\t',              # tab separated output
    n=5)                         # compute n-grams up to 5-grams
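# Hedged follow-up sketch: the gzipped file written above holds tab-separated
# "n-stem<TAB>count" lines (plus a special --NB_DOC-- entry with the total
# number of documents), and pke's load_document_frequency_file reads it back
# into a dict. output_file refers to the path defined in the script above.
from pke import load_document_frequency_file

df = load_document_frequency_file(input_file=output_file, delimiter='\t')
n_docs = df.pop('--NB_DOC--', None)                 # total number of documents
print(f"{n_docs} documents, {len(df)} distinct n-stems")
print(sorted(df, key=df.get, reverse=True)[:10])    # most frequent n-stems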
import os
import logging
import sys
from string import punctuation

from pke import compute_document_frequency

# setting info in terminal
logging.basicConfig(level=logging.INFO)

# path to the collection of documents
input_dir = sys.argv[1]

# path to the df weights dictionary, saved as a gzipped csv file
output_file = sys.argv[2]

# stoplist: punctuation marks and CoreNLP bracket tokens
stoplist = list(punctuation)
stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']

# compute document frequencies
compute_document_frequency(input_dir=input_dir,
                           output_file=output_file,
                           format="corenlp",     # input files format
                           use_lemmas=False,     # do not use Stanford lemmas
                           stemmer="porter",     # use porter stemmer
                           stoplist=stoplist,    # stoplist
                           delimiter='\t',       # tab separated output
                           extension='xml',      # input files extension
                           n=5)                  # compute n-grams up to 5-grams
def run_trial():
    n = 10
    snlp_folder = "../data/test/core_nlp_samples"
    compute_document_frequency(
        snlp_folder,
        os.path.join("../data/test/interim/test_cargo_df.tsv.gz"),
        stoplist=list(STOP_WORDS))
    cargo_df = load_document_frequency_file(
        "../data/test/interim/test_cargo_df.tsv.gz")
    pke_factory = {
        "grammar": r"""
            NBAR:
                {<NOUN|PROPN|NUM|ADJ>*<NOUN|PROPN>}
            NP:
                {<NBAR>}
                {<NBAR><ADP><NBAR>}
        """,
        "filtering_params": {
            "stoplist": list(STOP_WORDS)
        },
        "extractors": {
            "tfidf": {
                "instance": PKEBasedTermsExtractor(TfIdf),
                "weighting_params": {
                    "df": cargo_df
                }
            },
            "yake": {
                "instance": PKEBasedTermsExtractor(YAKE),
                "filtering_params": {
                    "only_alphanum": True,
                    "strip_outer_stopwords": True
                },
                "weighting_params": {
                    "stoplist": list(STOP_WORDS)
                }
            },
            "kpm": {
                "instance": PKEBasedTermsExtractor(KPMiner),
                "weighting_params": {
                    "df": cargo_df
                }
            },
            "mprank": {
                "instance": PKEBasedTermsExtractor(MultipartiteRank),
                "weighting_params": {}
            },
            "positionrank": {
                "instance": PKEBasedTermsExtractor(PositionRank),
                "weighting_params": {}
            }
        }
    }
    for name in pke_factory["extractors"]:
        log.info(f"Begin Extraction with PKE based extractor: {name}")
        extractor_instance = pke_factory["extractors"][name]["instance"]
        if "filtering_params" in pke_factory["extractors"][name]:
            filtering_params = {
                **pke_factory["filtering_params"],
                **pke_factory["extractors"][name]["filtering_params"]
            }
        else:
            filtering_params = pke_factory["filtering_params"]
        extractor_instance.extract(
            snlp_folder,
            n,
            grammar=pke_factory["grammar"],
            filtering_params=filtering_params,
            weighting_params=pke_factory["extractors"][name]["weighting_params"],
            output_file=f"../data/test/extracted_terms_sample/{name}.csv",
            auto_term_file=f"../data/test/automatic_annotations/{name}.jsonl")
import logging
import sys
from string import punctuation

from pke import compute_document_frequency

# setting info in terminal
logging.basicConfig(level=logging.INFO)

# path to the collection of documents
input_dir = "/home/asjindal/Work/tf/keyword_extraction/resources/data/docs"

# path to the df weights dictionary, saved as a gzipped csv file
output_file = "/home/asjindal/Work/tf/keyword_extraction/resources/data/train_df_count.tsv.gz"

# stoplist: punctuation marks and CoreNLP bracket tokens
stoplist = list(punctuation)
stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']

# compute document frequencies
compute_document_frequency(
    input_dir=input_dir,
    output_file=output_file,
    format="raw",         # input files format
    use_lemmas=False,     # do not use Stanford lemmas
    stemmer=None,         # no stemming
    stoplist=stoplist,    # stoplist
    delimiter='\t',       # tab separated output
    extension='txt',      # input files extension
    n=3)                  # compute n-grams up to 3-grams
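# Hedged sketch of consuming the DF counts computed above with a DF-aware
# extractor such as KPMiner. "some_doc.txt" is a hypothetical raw-text file
# from the same collection; exact load_document arguments may vary slightly
# with the pke version.
import pke

df = pke.load_document_frequency_file(input_file=output_file)

extractor = pke.unsupervised.KPMiner()
extractor.load_document(input="some_doc.txt", language='en', normalization=None)
extractor.candidate_selection()          # default KP-Miner candidate filtering
extractor.candidate_weighting(df=df)     # weight candidates with the DF counts
print(extractor.get_n_best(n=10))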
stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']

# ======================================================================================================================
# Compute document frequency for SemEval
# ======================================================================================================================
"""Compute Document Frequency (DF) counts from a collection of documents.

N-grams up to 3-grams are extracted and converted to their n-stems forms.
Those containing a token that occurs in a stoplist are filtered out.
Output file is in compressed (gzip) tab-separated-values format (tsv.gz).
"""
compute_document_frequency(
    input_dir='../data/benchmark_data/semeval_2010/train_test_combined/',
    output_file='doc_freq/semeval_2010_doc_freq.tsv.gz',
    extension='xml',             # input file extension
    language='en',               # language of files
    normalization="stemming",    # use porter stemmer
    stoplist=stoplist)

# ======================================================================================================================
# Load the NUS benchmark data
# ======================================================================================================================
file = '..\\data\\benchmark_data\\NUS.json'  # TEST data to evaluate the final model

json_data = []
for line in open(file, 'r', encoding="utf8"):
    json_data.append(json.loads(line))

# convert json to dataframe
        ends = [
            int(u.text)
            for u in sentence.iterfind('tokens/token/CharacterOffsetEnd')
        ]
        doc = {
            'words': [u.text for u in sentence.iterfind('tokens/token/word')],
            'lemmas': [u.text for u in sentence.iterfind('tokens/token/lemma')],
            'POS': [u.text for u in sentence.iterfind('tokens/token/POS')],
            'char_offsets': [(starts[k], ends[k]) for k in range(len(starts))]
        }
        sentences.append([(doc['words'][i], doc['POS'][i])
                          for i in range(len(doc['words']))])
    return sentences


documents = []
for fn in glob(input_dir + '*.xml'):
    doc = read_corenlp_xml(fn)
    documents.append(doc)

# compute document frequencies
compute_document_frequency(
    documents,
    output_file=output_file,
    language='en',               # language of the input files
    normalization='stemming',    # use porter stemmer
    stoplist=stoplist,           # stoplist
    n=5)                         # compute n-grams up to 5-grams
###############################################################################

###############################################################################
# PRE-COMPUTING WEIGHTS/STATS
###############################################################################

# pre-compute DF weights if needed
need_df = any(model in ['KPMiner', 'Wingnus', 'TfIdf', 'Kea']
              for model in params['models'])

if need_df and not os.path.isfile(path_to_df_file):
    logging.info("computing DF weights from {}".format(params["path"]))
    pke.compute_document_frequency(input_dir=path_to_train,
                                   output_file=path_to_df_file,
                                   extension=params["extension"],
                                   language=params["language"],
                                   normalization=params["normalization"],
                                   stoplist=punctuations,
                                   delimiter='\t',
                                   n=5)

# pre-compute LDA distributions if needed
need_lda = any(model in ['TopicalPageRank'] for model in params['models'])

if need_lda and not os.path.isfile(path_to_lda_file):
    logging.info("computing LDA distributions from {}".format(params["path"]))
    pke.compute_lda_model(input_dir=path_to_train,
                          output_file=path_to_lda_file,
                          n_topics=params["n_topics"],
                          extension=params["extension"],
                          language=params["language"],
                          normalization=params["normalization"])
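# Hedged sketch of how the pre-computed files are typically consumed later in
# such a pipeline: DF counts feed models like TfIdf/KPMiner/Kea through
# candidate_weighting(df=...), while the LDA distributions feed TopicalPageRank
# through its lda_model argument. "doc.xml" is a placeholder document; the
# other names reuse the variables defined above.
import pke

df_counts = pke.load_document_frequency_file(input_file=path_to_df_file)

extractor = pke.unsupervised.TopicalPageRank()
extractor.load_document(input="doc.xml",
                        language=params["language"],
                        normalization=params["normalization"])
extractor.candidate_selection()
extractor.candidate_weighting(window=10, lda_model=path_to_lda_file)
print(extractor.get_n_best(n=10))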