def main(args):
    ## infile is the output file from distributional term extraction
    txt_file_list = args[1]
    file_type = args[2]
    valid_types = ['.htm', '.html', '.txt', '.hml', '.xml', '.xhtml',
                   '.sgm', '.sgml', '.xhml']
    if file_type.lower() not in valid_types:
        print('Warning: File type must be a member of the list', valid_types)
        print('Halting program. Choose a member of this list and run this function again.')
        return 'Fail'
    with File(txt_file_list).openText() as instream:
        for line in instream.readlines():
            infile = line.strip()
            input_file = File(infile + file_type)
            txt2_file = File[TXT2](infile + '.txt2')
            txt3_file = File[TXT3](infile + '.txt3')
            fact_file = File[ABBR](infile + '.fact')  # @semanticbeeng @todo static typing: is FACT same as ABBR?
            create_termolotator_fact_txt_files(input_file, txt2_file,
                                               txt3_file, fact_file)
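This entry point presumably receives `sys.argv`; a minimal invocation sketch (the script and file names are hypothetical):

import sys

if __name__ == '__main__':
    # e.g. python make_fact_files.py doc_list.txt .html
    # doc_list.txt holds one document prefix per line (names are made up)
    main(sys.argv)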
Example 2
def _save_stemdict(filename: str) -> None:
    logger.debug('Saving stemming dictionary...')
    f = File(filename).openBin(mode='w')
    global stemdict
    global unstemdict
    pickle.dump((stemdict, unstemdict), f)
    f.close()
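`_save_stemdict` and its counterpart `_get_stemdict` (Example 5) form a plain pickle round-trip; the same pattern with the standard library alone, for illustration (`File(...).openBin` is the project's wrapper around binary file handles, and the dictionary contents here are made up):

import pickle

stemdict, unstemdict = {'running': 'run'}, {'run': ['running']}
with open('stem.pkl', 'wb') as f:
    pickle.dump((stemdict, unstemdict), f)
with open('stem.pkl', 'rb') as f:
    stemdict, unstemdict = pickle.load(f)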
Example 3
def main(args):
    ## infile is the output file from distributional term extraction
    txt_file_list = args[1]
    out_file = args[2]
    ## outfile is the main output dictionary file from this process
    extensions = args[3:]

    if extensions:
        ## raw string avoids invalid-escape warnings; compile once, outside the loop
        ending_pattern = re.compile(
            r'\.((txt)|(hml)|(htm)|(html)|(xml)|(sgml))[^a-zA-Z]*$', re.I)
        with File(txt_file_list).openText() as instream, \
                File(out_file).openText(mode='w') as outstream:
            for line in instream.readlines():
                line = line.strip()
                base = ending_pattern.sub('', line)
                if extensions[0] == 'BARE':
                    outstream.write(base)
                else:
                    outstream.write(base + extensions[0])
                for extension in extensions[1:]:
                    if extension.upper() in ['TRUE', 'FALSE', 'T', 'F']:
                        outstream.write(';' + extension)
                    elif extension.upper() == 'BARE':
                        outstream.write(base)
                    else:
                        outstream.write(';' + base + extension)
                outstream.write(os.linesep)
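A quick illustration of what `ending_pattern` strips; the trailing `[^a-zA-Z]*` also swallows non-letter suffixes such as the `.txt2`/`.txt3` digits used elsewhere in this pipeline (the sample names are made up):

import re

ending_pattern = re.compile(
    r'\.((txt)|(hml)|(htm)|(html)|(xml)|(sgml))[^a-zA-Z]*$', re.I)

print(ending_pattern.sub('', 'report.HTML'))  # -> report (case-insensitive)
print(ending_pattern.sub('', 'notes.txt3'))   # -> notes (trailing digit swallowed)
print(ending_pattern.sub('', 'image.png'))    # -> image.png (no match, unchanged)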
Example 4
def get_my_string_list(input_file: File) -> List[str]:
    try:
        instream = input_file.openText()
        output = instream.readlines()
    except UnicodeDecodeError:
        ## fall back to Latin-1 when the file is not valid UTF-8;
        ## a bare except here would also mask I/O errors
        instream = input_file.openText(encoding='ISO-8859-1')
        output = instream.readlines()
    return output
Example 5
def _get_stemdict(filename: str) -> None:
    logger.debug('Loading stemming dictionary...')
    f = File(filename).openBin(mode='r')
    global stemdict
    global unstemdict
    stemdict, unstemdict = pickle.load(f)
    f.close()

    # stemdict = dictionary.freeze_dict(stemdict)       # @semanticbeeng @todo global state initialization : this fails
    unstemdict = dictionary.freeze_dict(unstemdict)     # @semanticbeeng @todo global state initialization
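`dictionary.freeze_dict` presumably wraps `utilspie`'s `frozendict`, which this codebase imports elsewhere (see Example 18); a minimal sketch of why the loaded dictionaries are frozen (contents are made up):

from utilspie.collectionsutils import frozendict

unstemdict = frozendict({'run': ['running']})
print(unstemdict['run'])             # reads behave like a normal dict
try:
    unstemdict['walk'] = ['walked']  # any mutation attempt raises
except Exception as e:
    print('mutation rejected:', type(e).__name__)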
Example 6
def _get_stops() -> None:
    """Import stop words either from a text file or stopwords corpus"""
    global stops
    import Settings
    filename = Settings.dir_name + 'patentstops.txt'

    if os.path.isfile(filename):  ## otherwise fall back to the stopwords corpus
        f = File(filename).openText()
        for line in f.readlines():
            stops += line.split()
        f.close()
    else:
        stops = stopwords.words('english')
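`stopwords` in the fallback branch is presumably NLTK's corpus reader, whose standard API looks like this (it requires a one-time `nltk.download('stopwords')`):

from nltk.corpus import stopwords

stops = stopwords.words('english')
print(stops[:5])  # ['i', 'me', 'my', 'myself', 'we']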
Example 7
def find_inline_terms_for_file_list(file_list: File, dict_prefix: str = None) -> None:
    start = True
    with file_list.openText() as instream:
        # if dict_prefix:
        #     unigram_dictionary.clear()
        ## see derive_plurals in term_utilities
        ## and other instances of "unigram_dict" below
        for line in instream.readlines():
            file_prefix = line.strip()

            # @semanticbeeng @todo @dataFlow
            lines: List[str] = get_lines_from_file(
                File[TXT3](file_prefix + '.txt3'))  ## add feature to remove xml

            run_abbreviate_on_lines(lines,
                                    File[ABBR](file_prefix + '.abbr'),
                                    reset_dictionary=start)  # @semanticbeeng @todo @arch global state mutation

            ## creates abbreviation files and acquires abbreviation --> term
            ## and term --> abbreviation dictionaries
            ## Possibly add alternative which loads existing abbreviation files into
            ## dictionaries for future steps (depends on timing)

            # if dict_prefix:
            #     increment_unigram_dict_from_lines(lines)

            find_inline_terms(lines,
                              File[FACT](file_prefix + '.fact'),
                              File[POS](file_prefix + '.pos'),
                              File[TERM](file_prefix + '.terms'))

            start = False
        if dict_prefix:
            save_abbrev_dicts(File[ABBR](str(dict_prefix) + ".dict_abbr_to_full"),
                              File[ABBR](str(dict_prefix) + ".dict_full_to_abbr"))
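A hedged usage sketch, assuming a plain list file of document prefixes (all names here are made up):

find_inline_terms_for_file_list(File('doc_list.txt'), dict_prefix='corpus')
# for each listed prefix doc1 this expects doc1.txt3 as input and writes
# doc1.abbr, doc1.fact, doc1.pos and doc1.terms, plus the shared
# corpus.dict_abbr_to_full / corpus.dict_full_to_abbr dictionaries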
Example 8
def main(args):
    file_list = args[1]
    with File(file_list).openText() as instream:
        for line in instream.readlines():
            fact, pos = line.strip().split(';')
            fix_bad_char_in_file(File[FACT](fact), File[POS](pos))
Example 9
def expand(string: str) -> str:
    """Expand abbreviations within string"""
    global abbreviations
    if not abbreviations:
        _get_abbreviations(File('./jargon.out'))
    words = string.split()
    for i, word in enumerate(words):
        expansion = abbreviations.get(word)
        if expansion:
            words[i] = expansion
    string = _reGlue(words)
    return string
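`_reGlue` is not shown in these excerpts; a plausible minimal stand-in (an assumption, since the real helper may restore the original spacing or punctuation differently):

def _reGlue(words: List[str]) -> str:
    ## hypothetical: rejoin tokens with single spaces
    return ' '.join(words)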
Example 10
def getTerms(self, filename, filters=[], relaxed=False, overwrite=False):
    """Input file, output a FreqDist of terms"""
    filterfname = os.path.join(os.path.dirname(filename), 'filter.save')
    if os.path.exists(filename + '.nps') and os.path.exists(filterfname):
        f = File(filename + '.nps').openBin(mode='r')
        old_filters, fd = pickle.load(f)
        f.close()
        if old_filters == filters:
            if not Filter.unstemdict:
                Filter._get_stemdict(filterfname)
            return fd
    NPs = self.getNPs(filename)
    fd = FreqDist()
    for NP in NPs:
        # get the possible terms for each NP
        terms = self.extractPossibleTerms(NP, relaxed)
        # filter each term by some given criteria;
        # this requires keeping case information until this point
        # filt = Filter.Filter() # class containing all filters
        for t in terms:
            for f in filters:
                t = Filter.criteria[f](t)  # @semanticbeeng @todo global state mutation
            if t:
                fd[t] += 1
    if overwrite or (not os.path.isfile(filename + '.nps')):
        f = File[CHUNKNPS](filename + '.nps').openBin('w')
        pickle.dump((filters, fd), f)
        f.close()
    if os.path.exists(filterfname):
        os.remove(filterfname)
    return fd
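`FreqDist` is presumably NLTK's frequency distribution (a `collections.Counter` subclass), so the `fd[t] += 1` pattern above counts term occurrences directly; a minimal sketch with made-up terms:

from nltk import FreqDist

fd = FreqDist()
for t in ['gene', 'protein', 'gene']:
    fd[t] += 1
print(fd.most_common(2))  # [('gene', 2), ('protein', 1)]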
Example 11
def main(args):
    # global special_domains

    Refactoring.run_filter_phase = True

    file_prefix = args[1]
    web_score_dict_file = args[2]

    use_web_score: bool = None
    if args[3].lower() in ['true', 't']:
        use_web_score = True
    elif args[3].lower() in ['false', 'f']:
        use_web_score = False
    else:
        print('You set the webscore flag to', args[3],
              'but it must be either "True" or "False".')
        print('Use "True" if you want the system to use the webscore function;',
              'it will run more slowly but be more accurate.')
        print('Use "False" otherwise.')
        return

    max_term_number = int(args[4])

    if (len(args) > 5) and (args[5].lower() != 'false'):
        dictionary.special_domains.extend(args[5].split('+'))  # @semanticbeeng @todo @arch global state initialization

    dictionary.initialize_utilities()

    filter_terms(
        File[TERM](file_prefix + ".all_terms"),
        File(file_prefix + ".scored_output"),
        File[ABBR](file_prefix + ".dict_abbr_to_full"),
        # full_abbr_file = file_prefix + ".dict_full_to_abbr"      # @semanticbeeng not used
        use_web_score,
        numeric_cutoff=max_term_number,
        reject_file=File(file_prefix + ".rejected-terms"),
        web_score_dict_file=File(web_score_dict_file))
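A sketch of the expected command line, inferred from the `args` indexing above (the script name is hypothetical):

# python filter_terms_main.py my_corpus webscore.dict True 5000 biology+chemistry
#   args[1]  file prefix for .all_terms, .scored_output, .dict_abbr_to_full, ...
#   args[2]  web-score dictionary file
#   args[3]  use_web_score flag ('True'/'False')
#   args[4]  maximum number of output terms
#   args[5]  optional '+'-separated special domains, or 'False'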
Example 12
def main(args):
    # global special_domains
    file_list = args[1]
    if len(args) > 2:
        outfile_prefix = args[2]
    else:
        outfile_prefix = False
    if (len(args) > 3) and (args[3].lower() != 'false'):
        dictionary.special_domains.extend(args[3].split('+'))  # @semanticbeeng @todo @arch global state initialization

    dictionary.initialize_utilities()
    run_abbreviate_on_file_list(File(file_list),
                                dict_prefix=outfile_prefix)  # @semanticbeeng @todo @arch global state mutation
Example 13
def _get_abbreviations(file: File) -> None:
    """Import abbreviations from jargon file"""
    global abbreviations

    f = file.openText()
    for line in f.readlines():
        temp = line.rstrip('\n').split('|||')  ## strip the newline so the last short form is a usable key
        fullword = temp[0]
        shortwords = temp[1].split('||')
        for w in shortwords:
            abbreviations[w] = fullword
    f.close()

    abbreviations = dictionary.freeze_dict(abbreviations)  # @semanticbeeng @todo global state initialization
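The jargon file evidently keeps one full form per line, with '|||' separating it from its '||'-separated short forms; a made-up line as illustration:

line = 'transmission control protocol|||TCP||tcp\n'
fullword, rest = line.rstrip('\n').split('|||')
print(fullword)           # transmission control protocol
print(rest.split('||'))   # ['TCP', 'tcp']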
Example 14
def load_web_score_dict_file(dict_file: File) -> None:

    global webscore_dict
    webscore_dict.clear()

    if os.path.isfile(dict_file.name):
        with dict_file.openText() as instream:
            for line in instream.readlines():
                line = line.strip(os.linesep)
                term, score = line.split('\t')
                webscore_dict[term] = float(score)

        webscore_dict = dictionary.freeze_dict(webscore_dict)  # @semanticbeeng @todo global state initialization: this fails
    else:
        print(dict_file, 'does not exist; it will be created.')
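By the parsing above, the web-score file holds one tab-separated term/score pair per line; made-up contents:

line = 'machine translation\t0.87'
term, score = line.split('\t')
webscore_dict = {term: float(score)}  # {'machine translation': 0.87}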
Example 15
def main(args):
    # global special_domains

    Refactoring.run_filter_phase = False

    file_list = args[1]
    if len(args) > 2:
        outfile_prefix = args[2]
    else:
        outfile_prefix = False
    if (len(args) > 3) and (args[3].lower() != 'false'):
        dictionary.special_domains.extend(args[3].split('+'))  # @semanticbeeng @todo @arch global state initialization

    dictionary.initialize_utilities()
    find_inline_terms_for_file_list(File(file_list),
                                    dict_prefix=outfile_prefix)
Example 16
def read_in_org_dictionary(dict_file: File,
                           dictionary: str = 'org',
                           shallow: bool = True,
                           lower: bool = False,
                           patent: bool = False) -> None:

    global organization_dictionary
    global location_dictionary
    global nationality_dictionary

    if dictionary == 'org':
        organization_dictionary.clear()
    elif dictionary == 'loc':
        location_dictionary.clear()
    elif dictionary == 'nat':
        nationality_dictionary.clear()
    elif dictionary == 'discourse':
        raise Exception('undefined variable discourse_dictionary')  # @semanticbeeng @todo dead code
        # discourse_dictionary.clear()
    elif dictionary == 'term_relation':
        raise Exception('undefined variable term_rel_dictionary')  # @semanticbeeng @todo dead code
        # term_rel_dictionary.clear()

    with dict_file.openText(mode='r') as instream:
        for line in instream.readlines():
            add_dictionary_entry(line,
                                 dictionary,
                                 shallow,
                                 lower=lower,
                                 patent=patent)

    # @semanticbeeng @arch global state immutable
    if dictionary == 'org':
        organization_dictionary = freeze_dict(organization_dictionary)
    elif dictionary == 'loc':
        location_dictionary = freeze_dict(location_dictionary)
    elif dictionary == 'nat':
        nationality_dictionary = freeze_dict(nationality_dictionary)
Example 17
def read_in_pos_file() -> None:
    # global pos_dict
    global pos_file
    global jargon_files

    config.pos_dict.clear()

    for line in pos_file.openText().readlines():
        line = line.strip()
        items = line.split('\t')
        config.pos_dict[items[0]] = items[1:]

    for dictionary in special_domains:
        jargon_files.append(File(dictionary_table[dictionary]))

    for jargon_file in jargon_files:
        ## remove jargon from dictionary
        with jargon_file.openText() as instream:
            for line in instream.readlines():
                word: str = line.strip().lower()
                if word in config.pos_dict:
                    ## pos_dict.pop(word)
                    jargon_words.add(word)
Example 18
import os
from utilspie.collectionsutils import frozendict
from typing import List, Dict, Tuple
from DataDef import File
import term_utilities
import config

##
#   Dictionaries
#
DICT_DIRECTORY = os.path.dirname(
    os.path.realpath(__file__)) + os.sep + "dicts" + os.sep
## DICT_DIRECTORY = '../'
## DICT_DIRECTORY = './'
ORG_DICTIONARY: File = File(DICT_DIRECTORY + 'org_dict.txt')
LOC_DICTIONARY: File = File(DICT_DIRECTORY + 'location-lisp2-ugly.dict')
#
#   @semanticbeeng not used
#
# NAT_DICTIONARY = DICT_DIRECTORY + 'nationalities.dict'
# DISC_DICTIONARY = DICT_DIRECTORY + 'discourse.dict'
# TERM_REL_DICTIONARY = DICT_DIRECTORY + 'term_relation.dict'

nom_file: File = File(DICT_DIRECTORY + 'NOMLIST.dict')
pos_file: File = File(DICT_DIRECTORY + 'POS.dict')
nom_map_file: File = File(DICT_DIRECTORY + 'nom_map.dict')
person_name_file: File = File(DICT_DIRECTORY + 'person_name_list.dict')
nat_name_file: File = File(DICT_DIRECTORY + 'nationalities_name_list.dict')
skippable_adj_file: File = File(DICT_DIRECTORY + 'out_adjectives.dict')
out_ing_file: File = File(DICT_DIRECTORY + 'out_ing.dict')
time_name_file: File = File(DICT_DIRECTORY + 'time_names.dict')
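Presumably these module-level `File` constants feed the loaders shown earlier; a hedged sketch of the wiring (this call site is an assumption, not part of the original module):

def initialize_dictionaries() -> None:
    ## hypothetical: load each dictionary file into its frozen module-level dict
    read_in_org_dictionary(ORG_DICTIONARY, dictionary='org')
    read_in_org_dictionary(LOC_DICTIONARY, dictionary='loc')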