def main(args):
    ## infile is the output file from distributional term extraction
    txt_file_list = args[1]
    file_type = args[2]
    valid_types = ['.htm', '.html', '.txt', '.hml', '.xml',
                   '.xhtml', '.sgm', '.sgml', '.xhml']
    if not file_type.lower() in valid_types:
        print('Warning: File type must be a member of the list', valid_types)
        print('Halting program. Choose a member of this list and run this function again.')
        return 'Fail'
    with File(txt_file_list).openText() as instream:
        for line in instream.readlines():
            infile = line.strip()
            input_file = File(infile + file_type)
            txt2_file = File[TXT2](infile + '.txt2')
            txt3_file = File[TXT3](infile + '.txt3')
            fact_file = File[ABBR](infile + '.fact')  # @semanticbeeng @todo static typing: is FACT same as ABBR?
            create_termolotator_fact_txt_files(input_file, txt2_file, txt3_file, fact_file)
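## Invocation sketch (the script name is hypothetical; arguments follow main() above):
##   python make_fact_txt_files.py file_list.txt .html
## file_list.txt holds one path prefix per line; each prefix is expanded into
## <prefix>.html (input) and <prefix>.txt2 / <prefix>.txt3 / <prefix>.fact (outputs).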
def _save_stemdict(filename: str) -> None:
    logger.debug('Saving stemming dictionary...')
    f = File(filename).openBin(mode='w')
    global stemdict
    global unstemdict
    pickle.dump((stemdict, unstemdict), f)
    f.close()
def main(args):
    ## infile is the output file from distributional term extraction
    txt_file_list = args[1]
    out_file = args[2]  ## outfile is the main output dictionary file from this process
    extensions = args[3:]
    if extensions:
        # compile once, outside the loop; raw string avoids invalid-escape warnings
        ending_pattern = re.compile(
            r'\.((txt)|(hml)|(htm)|(html)|(xml)|(sgml))[^a-zA-Z]*$', re.I)
        with File(txt_file_list).openText() as instream, \
                File(out_file).openText(mode='w') as outstream:
            for line in instream.readlines():
                line = line.strip()
                base = ending_pattern.sub('', line)
                if extensions[0] == 'BARE':
                    outstream.write(base)
                else:
                    outstream.write(base + extensions[0])
                for extension in extensions[1:]:
                    if extension.upper() in ['TRUE', 'FALSE', 'T', 'F']:
                        outstream.write(';' + extension)
                    elif extension.upper() == 'BARE':
                        outstream.write(base)
                    else:
                        outstream.write(';' + base + extension)
                outstream.write(os.linesep)
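## Usage sketch (file names are hypothetical): given the input line
## "docs/patent1.html" and extensions ['.terms', 'TRUE'], the loop writes
## "docs/patent1.terms;TRUE". 'BARE' emits the extensionless base, the
## TRUE/FALSE/T/F flags are copied through verbatim, and any other argument
## is appended as ';' + base + extension.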
def get_my_string_list(input_file: File) -> List[str]:
    try:
        instream = input_file.openText()
        output = instream.readlines()
    except UnicodeDecodeError:
        # Fall back to Latin-1 when the default encoding cannot decode the file.
        instream = input_file.openText(encoding='ISO-8859-1')
        output = instream.readlines()
    return output
def _get_stemdict(filename: str) -> None:
    logger.debug('Loading stemming dictionary...')
    f = File(filename).openBin(mode='r')
    global stemdict
    global unstemdict
    stemdict, unstemdict = pickle.load(f)
    f.close()
    # stemdict = dictionary.freeze_dict(stemdict)  # @semanticbeeng @todo global state initialization: this fails
    unstemdict = dictionary.freeze_dict(unstemdict)  # @semanticbeeng @todo global state initialization
def _get_stops() -> None:
    """Import stop words either from a text file or the stopwords corpus"""
    global stops
    import os
    import Settings
    filename = Settings.dir_name + 'patentstops.txt'
    if os.path.exists(filename):  # fall back to the NLTK corpus when the file is absent
        f = File(filename).openText()
        for line in f.readlines():
            stops += line.split()
        f.close()
    else:
        stops = stopwords.words('english')
def find_inline_terms_for_file_list(file_list: File, dict_prefix: str = None) -> None:
    start = True
    with file_list.openText() as instream:
        # if dict_prefix:
        #     unigram_dictionary.clear()
        ## see derive_plurals in term_utilities
        ## and other instances of "unigram_dict" below
        for line in instream.readlines():
            file_prefix = line.strip()  # @semanticbeeng @todo @dataFlow
            lines: List[str] = get_lines_from_file(File[TXT3](file_prefix + '.txt3'))
            ## add feature to remove xml
            run_abbreviate_on_lines(lines,
                                    File[ABBR](file_prefix + '.abbr'),
                                    reset_dictionary=start)  # @semanticbeeng @todo @arch global state mutation
            ## creates abbreviation files and acquires abbreviation --> term
            ## and term --> abbreviation dictionaries
            ## Possibly add alternative which loads existing abbreviation files into
            ## dictionaries for future steps (depends on timing)
            # if dict_prefix:
            #     increment_unigram_dict_from_lines(lines)
            find_inline_terms(lines,
                              File[FACT](file_prefix + '.fact'),
                              File[POS](file_prefix + '.pos'),
                              File[TERM](file_prefix + '.terms'))
            if start:
                start = False
    if dict_prefix:
        save_abbrev_dicts(File[ABBR](str(dict_prefix) + ".dict_abbr_to_full"),
                          File[ABBR](str(dict_prefix) + ".dict_full_to_abbr"))
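## Pipeline sketch: for each prefix in the file list, <prefix>.txt3 is read and
## <prefix>.abbr, <prefix>.fact, <prefix>.pos and <prefix>.terms are written;
## when dict_prefix is supplied, the accumulated abbreviation dictionaries are
## saved once, after all files have been processed.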
def main(args):
    file_list = args[1]
    with File(file_list).openText() as instream:
        for line in instream.readlines():
            fact, pos = line.strip().split(';')
            fix_bad_char_in_file(File[FACT](fact), File[POS](pos))
def expand(string: str) -> str:
    """Expand abbreviations within string"""
    global abbreviations
    if not abbreviations:
        _get_abbreviations(File('./jargon.out'))
    words = string.split()
    for i in range(len(words)):
        temp = abbreviations.get(words[i])
        if temp:
            words[i] = temp
    string = _reGlue(words)
    return string
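## Usage sketch, assuming a hypothetical jargon.out entry "information|||info||infos"
## (fullword|||short||short, as parsed by _get_abbreviations below):
##   expand('the info pamphlet')  ->  'the information pamphlet'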
def getTerms(self, filename, filters=[], relaxed=False, overwrite=False):
    """Input file, output a FreqDist of terms"""
    filterfname = os.path.join(os.path.dirname(filename), 'filter.save')
    if os.path.exists(filename + '.nps') and os.path.exists(filterfname):
        f = File(filename + '.nps').openBin(mode='r')
        old_filters, fd = pickle.load(f)
        f.close()
        if old_filters == filters:
            if not Filter.unstemdict:
                Filter._get_stemdict(filterfname)
            return fd
    NPs = self.getNPs(filename)
    fd = FreqDist()
    for NP in NPs:
        # get the possible terms for each NP
        terms = self.extractPossibleTerms(NP, relaxed)
        # filter each term by some given criteria
        # this requires keeping case information until this point
        # filt = Filter.Filter()  # class containing all filters
        for t in terms:
            for f in filters:
                t = Filter.criteria[f](t)  # @semanticbeeng @todo global state mutation
            if t:
                fd[t] += 1
    if overwrite or (not os.path.isfile(filename + '.nps')):
        f = File[CHUNKNPS](filename + '.nps').openBin('w')
        pickle.dump((filters, fd), f)
        f.close()
        if os.path.exists(filterfname):
            os.remove(filterfname)
    return fd
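## Caching note: <filename>.nps holds a pickled (filters, FreqDist) pair, so a
## repeat call with an identical filter list skips NP extraction entirely;
## pass overwrite=True to force recomputation (this also removes filter.save).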
def main(args):
    # global special_domains
    Refactoring.run_filter_phase = True
    file_prefix = args[1]
    web_score_dict_file = args[2]
    use_web_score: bool
    if args[3].lower() in ['true', 't']:
        use_web_score = True
    elif args[3].lower() in ['false', 'f']:
        use_web_score = False
    else:
        print('You set the webscore flag to', args[3],
              'but it must be either "True" or "False".')
        print('Use "True" if you want the system to use the webscore function; '
              'the system will run more slowly but be more accurate.')
        print('Use "False" otherwise.')
        return  # halt on an invalid flag, as the other entry points do
    max_term_number = int(args[4])
    if (len(args) > 5) and (args[5].lower() != 'false'):
        dictionary.special_domains.extend(args[5].split('+'))  # @semanticbeeng @todo @arch global state initialization
    dictionary.initialize_utilities()
    filter_terms(File[TERM](file_prefix + ".all_terms"),
                 File(file_prefix + ".scored_output"),
                 File[ABBR](file_prefix + ".dict_abbr_to_full"),
                 # full_abbr_file = file_prefix + ".dict_full_to_abbr"  # @semanticbeeng not used
                 use_web_score,
                 numeric_cutoff=max_term_number,
                 reject_file=File(file_prefix + ".rejected-terms"),
                 web_score_dict_file=File(web_score_dict_file))
def main(args):
    # global special_domains
    file_list = args[1]
    if len(args) > 2:
        outfile_prefix = args[2]
    else:
        outfile_prefix = False
    if (len(args) > 3) and (args[3].lower() != 'false'):
        dictionary.special_domains.extend(args[3].split('+'))  # @semanticbeeng @todo @arch global state initialization
    dictionary.initialize_utilities()
    run_abbreviate_on_file_list(File(file_list),
                                dict_prefix=outfile_prefix)  # @semanticbeeng @todo @arch global state mutation
def _get_abbreviations(file: File) -> None:
    """Import abbreviations from jargon file"""
    global abbreviations
    f = file.openText()
    for line in f.readlines():
        temp = line.split('|||')
        fullword = temp[0]
        shortwords = temp[1].split('||')
        for w in shortwords:
            abbreviations[w] = fullword
    abbreviations = dictionary.freeze_dict(abbreviations)  # @semanticbeeng @todo global state initialization
    f.close()
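## Jargon file format, as parsed above: one entry per line,
##   fullword|||short1||short2||...
## Every short form is mapped back to its full word in `abbreviations`.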
def load_web_score_dict_file(dict_file: File) -> None:
    global webscore_dict
    webscore_dict.clear()
    if os.path.isfile(dict_file.name):
        with dict_file.openText() as instream:
            for line in instream.readlines():
                line = line.strip(os.linesep)
                term, score = line.split('\t')
                webscore_dict[term] = float(score)
        webscore_dict = dictionary.freeze_dict(webscore_dict)  # @semanticbeeng @todo global state initialization: this fails
    else:
        print(dict_file, 'does not exist and will be created.')
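## Expected dictionary file format: one tab-separated "term<TAB>score" pair per
## line, where the score parses as a float, e.g. (illustrative entry)
##   transgenic mouse	0.87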
def main(args):
    # global special_domains
    Refactoring.run_filter_phase = False
    file_list = args[1]
    if len(args) > 2:
        outfile_prefix = args[2]
    else:
        outfile_prefix = False
    if (len(args) > 3) and (args[3].lower() != 'false'):
        dictionary.special_domains.extend(args[3].split('+'))  # @semanticbeeng @todo @arch global state initialization
    dictionary.initialize_utilities()
    find_inline_terms_for_file_list(File(file_list), dict_prefix=outfile_prefix)
def read_in_org_dictionary(dict_file: File,
                           dictionary: str = 'org',
                           shallow: bool = True,
                           lower: bool = False,
                           patent: bool = False) -> None:
    global organization_dictionary
    global location_dictionary
    global nationality_dictionary
    if dictionary == 'org':
        organization_dictionary.clear()
    elif dictionary == 'loc':
        location_dictionary.clear()
    elif dictionary == 'nat':
        nationality_dictionary.clear()
    elif dictionary == 'discourse':
        raise Exception('undefined variable discourse_dictionary')  # @semanticbeeng @todo dead code
        # discourse_dictionary.clear()
    elif dictionary == 'term_relation':
        raise Exception('undefined variable term_rel_dictionary')  # @semanticbeeng @todo dead code
        # term_rel_dictionary.clear()
    with dict_file.openText(mode='r') as instream:
        for line in instream.readlines():
            add_dictionary_entry(line, dictionary, shallow, lower=lower,
                                 patent=patent)  # @semanticbeeng @arch global state immutable
    if dictionary == 'org':
        organization_dictionary = freeze_dict(organization_dictionary)
    elif dictionary == 'loc':
        location_dictionary = freeze_dict(location_dictionary)
    elif dictionary == 'nat':
        nationality_dictionary = freeze_dict(nationality_dictionary)
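## Usage sketch (assumes this function is importable alongside the dictionary
## path constants defined below, e.g. ORG_DICTIONARY and LOC_DICTIONARY):
##   read_in_org_dictionary(ORG_DICTIONARY, dictionary='org')
##   read_in_org_dictionary(LOC_DICTIONARY, dictionary='loc')
## Each call clears, repopulates, and then freezes the matching global dict.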
def read_in_pos_file() -> None:
    # global pos_dict
    global pos_file
    global jargon_files
    config.pos_dict.clear()
    for line in pos_file.openText().readlines():
        line = line.strip()
        items = line.split('\t')
        config.pos_dict[items[0]] = items[1:]
    for dictionary in special_domains:
        jargon_files.append(File(dictionary_table[dictionary]))
    for jargon_file in jargon_files:
        ## remove jargon from dictionary
        with jargon_file.openText() as instream:
            for line in instream.readlines():
                word: str = line.strip()
                word = word.lower()
                if word in config.pos_dict:
                    ## pos_dict.pop(word)
                    jargon_words.add(word)
import os
from utilspie.collectionsutils import frozendict
from typing import List, Dict, Tuple
from DataDef import File
import term_utilities
import config

##
# Dictionaries
#
DICT_DIRECTORY = os.path.dirname(os.path.realpath(__file__)) + os.sep + "dicts" + os.sep
## DICT_DIRECTORY = '../'
## DICT_DIRECTORY = './'

ORG_DICTIONARY: File = File(DICT_DIRECTORY + 'org_dict.txt')
LOC_DICTIONARY: File = File(DICT_DIRECTORY + 'location-lisp2-ugly.dict')
# # @semanticbeeng not used
# # NAT_DICTIONARY = DICT_DIRECTORY + 'nationalities.dict'
# DISC_DICTIONARY = DICT_DIRECTORY + 'discourse.dict'
# TERM_REL_DICTIONARY = DICT_DIRECTORY + 'term_relation.dict'

nom_file: File = File(DICT_DIRECTORY + 'NOMLIST.dict')
pos_file: File = File(DICT_DIRECTORY + 'POS.dict')
nom_map_file: File = File(DICT_DIRECTORY + 'nom_map.dict')
person_name_file: File = File(DICT_DIRECTORY + 'person_name_list.dict')
nat_name_file: File = File(DICT_DIRECTORY + 'nationalities_name_list.dict')
skippable_adj_file: File = File(DICT_DIRECTORY + 'out_adjectives.dict')
out_ing_file: File = File(DICT_DIRECTORY + 'out_ing.dict')
time_name_file: File = File(DICT_DIRECTORY + 'time_names.dict')