def get_abbreviation_dict(sentences):
    """Extract abbreviation/definition pairs from a text or a list of sentences.

    Args:
        sentences: Either a single string, or a list/tuple of strings. A
            sequence is joined with single spaces into one document before
            extraction.

    Returns:
        Whatever ``schwartz_hearst.extract_abbreviation_definition_pairs``
        returns when called with ``most_common_definition=True``.

    Raises:
        TypeError: If ``sentences`` is neither a string nor a list/tuple.
            (Previously such input surfaced as an ``UnboundLocalError`` at
            the ``return`` statement.)
    """
    if isinstance(sentences, str):
        doc_text = sentences
    elif isinstance(sentences, (list, tuple)):
        doc_text = " ".join(sentences)
    else:
        raise TypeError(
            "sentences must be a str or a list/tuple of str, got "
            + type(sentences).__name__)
    return schwartz_hearst.extract_abbreviation_definition_pairs(
        doc_text=doc_text, most_common_definition=True)
def get_abbreviations(text):
    """Return the abbreviation/definition pairs extracted from ``text``.

    The text is handed unchanged to
    ``schwartz_hearst.extract_abbreviation_definition_pairs`` as ``doc_text``
    and that call's result is returned as-is.
    """
    return schwartz_hearst.extract_abbreviation_definition_pairs(doc_text=text)
def extractAcronymsAndPhrases(text):
    """Build a mapping from processed definition phrase to lower-cased acronym.

    Every abbreviation/definition pair found in ``text`` contributes one
    entry: the key is ``toLemmas(str(definition))`` and the value is the
    acronym in lower case. Each definition is also printed to stdout as it
    is processed.
    """
    extracted = schwartz_hearst.extract_abbreviation_definition_pairs(
        doc_text=text)
    links = {}
    for short_form, long_form in extracted.items():
        print(long_form)
        lowered = short_form.lower()
        # presumably toLemmas lemmatises the phrase; verify against its definition
        lemma_key = toLemmas(str(long_form))
        links[lemma_key] = lowered
    return links
def run_schwartz_algorithm():
    """Extract acronyms from every file in ``./original_text`` and record them.

    For each directory entry, the abbreviation/definition pairs are read via
    ``file_path``; every pair is passed to ``insert_acronym`` and collected
    into a per-document record that is appended to the module-level
    ``db_acronyms`` list.
    """
    global db_acronyms
    for doc_name in os.listdir("./original_text"):
        doc_path = "./original_text/" + doc_name
        found = schwartz_hearst.extract_abbreviation_definition_pairs(
            file_path=doc_path)
        entries = []
        for short_form, long_form in found.items():
            entries.append({'acronym': short_form, 'full_form': long_form})
            insert_acronym(doc_name, short_form, long_form)
        db_acronyms.append({'document_id': doc_name, 'acronyms': entries})
def find_abbreviations(text_docs: List[str]) -> Dict[str, str]:
    """Collect abbreviation -> full form pairs across several documents.

    Each document is processed independently with
    ``most_common_definition=True``; the per-document results are merged in
    order, so a later document overrides an earlier one for the same
    abbreviation. An empty input yields an empty dict.
    """
    per_document = (
        schwartz_hearst.extract_abbreviation_definition_pairs(
            doc_text=text, most_common_definition=True)
        for text in text_docs
    )
    merged = {}
    for mapping in per_document:
        merged.update(mapping)
    return merged
def get_abbreviations():
    """Sort extracted abbreviation pairs into the ``abb`` and ``ignore`` dicts.

    Iterates over the module-level ``all_cases``; entries whose first
    character is "." are skipped. Each remaining entry is read from
    ``<ENV["DATASET_PATH"]>/All_FT/<case>`` and run through
    ``schwartz_hearst.extract_abbreviation_definition_pairs``.

    Every (short form, definition) pair is then stored, without duplicates,
    in a list keyed by short form in either ``abb`` or ``ignore``. A pair
    goes to ``abb`` when any of the following holds, otherwise to ``ignore``:

    * the second value returned by ``function(short_form)`` is 0;
    * the second values of ``function(word)``, summed over the definition's
      space-separated words, total 0;
    * the ratio of summed first values to summed second values exceeds 0.8.

    NOTE(review): ``function`` presumably returns (uppercase count,
    lowercase count) for a string - confirm against its definition.
    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source - confirm against the original file.

    Returns nothing; results are written to the module-level dicts.
    """
    # Incremented once per file read but never otherwise used in this block.
    count = 1
    for case in all_cases:
        # Skip entries whose name starts with a dot.
        if case[0] == ".":
            continue
        with open("{}/All_FT/{}".format(ENV["DATASET_PATH"], case), 'r') as file:
            count += 1
            # print(count)
            file_content = file.read()
            pairs = schwartz_hearst.extract_abbreviation_definition_pairs(
                doc_text=file_content)
            for pair in pairs:
                # flag == 1 routes the pair to `abb`, flag == 0 to `ignore`.
                flag = 0
                upper_case, lower_case = function(pair)
                if (lower_case == 0):
                    flag = 1
                # Re-use the counters for totals over the definition's words.
                words = pairs[pair].split(' ')
                upper_case = 0
                lower_case = 0
                for word in words:
                    upper_case_1, lower_case_1 = function(word)
                    upper_case += upper_case_1
                    lower_case += lower_case_1
                # The `lower_case == 0` test short-circuits, guarding the division.
                if lower_case == 0 or upper_case / lower_case > 0.8:
                    flag = 1
                if flag == 0:
                    if pair not in ignore:
                        ignore[pair] = []
                        ignore[pair].append(pairs[pair])
                    else:
                        # Only record definitions not seen before for this key.
                        if pairs[pair] not in ignore[pair]:
                            ignore[pair].append((pairs[pair]))
                else:
                    if pair not in abb:
                        abb[pair] = []
                        abb[pair].append(pairs[pair])
                    else:
                        # Only record definitions not seen before for this key.
                        if pairs[pair] not in abb[pair]:
                            abb[pair].append((pairs[pair]))
def main(file, mode):
    """Extract abbreviation pairs from a tab-separated file.

    The lines yielded by ``preprocess_file(file)`` are split on tabs; the
    first line is treated as a header and dropped. The remaining rows are
    passed as ``tagged_text`` to
    ``schwartz_hearst.extract_abbreviation_definition_pairs``.

    Args:
        file: Passed straight to ``preprocess_file``.
        mode: Accepted for interface compatibility; not used here.

    Returns:
        The result of the extraction call.
    """
    lines = iter(preprocess_file(file))
    next(lines, None)  # drop the header line, if there is one

    # str.split always returns at least one element, so the old `len(i) < 1`
    # guard (which held a leftover pdb breakpoint) could never fire and has
    # been removed.
    rows = [line.split('\t') for line in lines]

    return schwartz_hearst.extract_abbreviation_definition_pairs(
        tagged_text=rows)
def extract_expansions(acronyms, use_cached=True):
    """Write acronym expansions found in ``./abstracts`` to JSON; return the path.

    Args:
        acronyms: Ignored; the mapping is rebuilt from the abstracts.
        use_cached: Currently overridden to ``True`` inside the function
            (see the TODO below).

    Returns:
        The output path ``./data/derived/pubmed_acronym_expansions.json``.
        If that file already exists it is returned without any extraction.
        Otherwise every ``*.txt`` file in ``./abstracts`` is processed and a
        JSON object mapping each acronym to a list of expansions is written
        there first.
    """
    print('Extracting expansions from Pubmed...')
    cache_path = './data/derived/pubmed_acronym_expansions.json'
    # TODO use_cached = True because script is not runnable right now
    use_cached = True

    cache_hit = use_cached and os.path.exists(cache_path)
    if not cache_hit:
        abstract_names = [
            name for name in os.listdir('./abstracts') if name.endswith('.txt')
        ]
        expansions_by_acronym = collections.defaultdict(list)
        for name in abstract_names:
            found = schwartz_hearst.extract_abbreviation_definition_pairs(
                file_path='./abstracts/' + name)
            for short_form, long_form in found.items():
                expansions_by_acronym[short_form].append(long_form)
        with open(cache_path, 'w') as handle:
            json.dump(expansions_by_acronym, handle)
    return cache_path
def step_impl(context):
    """Run abbreviation extraction on ``context.text``.

    The extracted pairs are stored on ``context.result``; nothing is
    returned.
    """
    extract = schwartz_hearst.extract_abbreviation_definition_pairs
    context.result = extract(doc_text=context.text)
# Collect abbreviation definitions from every PDF below root_dir and dump the
# accumulated mapping to example1.csv.
root_dir = '/home/nightingale/Documents/jeff_dev/Hobby_Projects/'
accro_dict = {}

# The earlier pattern `('**/*.pdf' or '**/*.doc')` always evaluated to
# '**/*.pdf', and the guard `if '.doc' or '.pdf' in filename` was always true,
# so only PDFs were ever processed. That is now stated explicitly; the reader
# used below is PyPDF2's PDF reader.
for filename in glob.iglob(root_dir + '**/*.pdf', recursive=True):
    # The context manager closes the handle, which was previously leaked for
    # every file.
    with open(filename, 'rb') as pdf_obj:
        pdf_reader = PyPDF2.PdfFileReader(pdf_obj)
        for page_number in range(pdf_reader.numPages):
            page_obj = pdf_reader.getPage(page_number)
            text = page_obj.extractText()
            # most_common_definition=True: when a term has several
            # definitions, return the most common one for each term.
            pairs = schwartz_hearst.extract_abbreviation_definition_pairs(
                doc_text=text, most_common_definition=True)
            accro_dict.update(pairs)

    print(accro_dict)
    # The CSV is rewritten after each file with everything found so far.
    # Dict views are materialised as lists before being handed to pandas.
    df = pd.DataFrame({
        'ACRONYM': list(accro_dict.keys()),
        'MEANING': list(accro_dict.values()),
    })
    df.to_csv('example1.csv', index=False)

# Other ways to call the extractor:
#   * By default, the most recently encountered definition for each term is
#     returned: extract_abbreviation_definition_pairs(doc_text=text)
#   * A file can be read directly:
#     extract_abbreviation_definition_pairs(file_path='<path_to_file>')