def main(args):
    ## infile is the output file from distributional term extraction
    txt_file_list = args[1]
    file_type = args[2]
    valid_types = ['.htm', '.html', '.txt', '.hml', '.xml',
                   '.xhtml', '.sgm', '.sgml', '.xhml']
    if not file_type.lower() in valid_types:
        print('Warning: File type must be a member of the list', valid_types)
        print('Halting program. Choose a member of this list and run this function again.')
        return 'Fail'
    with File(txt_file_list).openText() as instream:
        for line in instream.readlines():
            infile = line.strip()
            input_file = File(infile + file_type)
            txt2_file = File[TXT2](infile + '.txt2')
            txt3_file = File[TXT3](infile + '.txt3')
            fact_file = File[ABBR](infile + '.fact')  # @semanticbeeng @todo static typing: is FACT same as ABBR?
            create_termolotator_fact_txt_files(input_file, txt2_file, txt3_file, fact_file)
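## Invocation sketch (the script name is hypothetical; arguments follow main() above):
##   python make_fact_txt_files.py file_list.txt .html
## file_list.txt holds one path prefix per line; each prefix is expanded into
## <prefix>.html (input) and <prefix>.txt2 / <prefix>.txt3 / <prefix>.fact (outputs).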
def _save_stemdict(filename: str) -> None:
    logger.debug('Saving stemming dictionary...')
    f = File(filename).openBin(mode='w')
    global stemdict
    global unstemdict
    pickle.dump((stemdict, unstemdict), f)
    f.close()
def main(args):
    ## infile is the output file from distributional term extraction
    txt_file_list = args[1]
    out_file = args[2]  ## outfile is the main output dictionary file from this process
    extensions = args[3:]
    if extensions:
        # compile once, outside the loop; raw string avoids invalid-escape warnings
        ending_pattern = re.compile(
            r'\.((txt)|(hml)|(htm)|(html)|(xml)|(sgml))[^a-zA-Z]*$', re.I)
        with File(txt_file_list).openText() as instream, \
                File(out_file).openText(mode='w') as outstream:
            for line in instream.readlines():
                line = line.strip()
                base = ending_pattern.sub('', line)
                if extensions[0] == 'BARE':
                    outstream.write(base)
                else:
                    outstream.write(base + extensions[0])
                for extension in extensions[1:]:
                    if extension.upper() in ['TRUE', 'FALSE', 'T', 'F']:
                        outstream.write(';' + extension)
                    elif extension.upper() == 'BARE':
                        outstream.write(base)
                    else:
                        outstream.write(';' + base + extension)
                outstream.write(os.linesep)
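## Usage sketch (file names are hypothetical): given the input line
## "docs/patent1.html" and extensions ['.terms', 'TRUE'], the loop writes
## "docs/patent1.terms;TRUE". 'BARE' emits the extensionless base, the
## TRUE/FALSE/T/F flags are copied through verbatim, and any other argument
## is appended as ';' + base + extension.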
def get_my_string_list(input_file: File) -> List[str]:
    try:
        instream = input_file.openText()
        output = instream.readlines()
    except UnicodeDecodeError:
        # Fall back to Latin-1 when the default encoding cannot decode the file.
        instream = input_file.openText(encoding='ISO-8859-1')
        output = instream.readlines()
    return output
def _get_stemdict(filename: str) -> None:
    logger.debug('Loading stemming dictionary...')
    f = File(filename).openBin(mode='r')
    global stemdict
    global unstemdict
    stemdict, unstemdict = pickle.load(f)
    f.close()
    # stemdict = dictionary.freeze_dict(stemdict)  # @semanticbeeng @todo global state initialization: this fails
    unstemdict = dictionary.freeze_dict(unstemdict)  # @semanticbeeng @todo global state initialization
def _get_stops() -> None:
    """Import stop words either from a text file or the stopwords corpus"""
    global stops
    import os
    import Settings
    filename = Settings.dir_name + 'patentstops.txt'
    if os.path.exists(filename):  # fall back to the NLTK corpus when the file is absent
        f = File(filename).openText()
        for line in f.readlines():
            stops += line.split()
        f.close()
    else:
        stops = stopwords.words('english')
def find_inline_terms_for_file_list(file_list: File, dict_prefix: str = None) -> None:
    start = True
    with file_list.openText() as instream:
        # if dict_prefix:
        #     unigram_dictionary.clear()
        ## see derive_plurals in term_utilities
        ## and other instances of "unigram_dict" below
        for line in instream.readlines():
            file_prefix = line.strip()  # @semanticbeeng @todo @dataFlow
            lines: List[str] = get_lines_from_file(File[TXT3](file_prefix + '.txt3'))
            ## add feature to remove xml
            run_abbreviate_on_lines(lines,
                                    File[ABBR](file_prefix + '.abbr'),
                                    reset_dictionary=start)  # @semanticbeeng @todo @arch global state mutation
            ## creates abbreviation files and acquires abbreviation --> term
            ## and term --> abbreviation dictionaries
            ## Possibly add alternative which loads existing abbreviation files into
            ## dictionaries for future steps (depends on timing)
            # if dict_prefix:
            #     increment_unigram_dict_from_lines(lines)
            find_inline_terms(lines,
                              File[FACT](file_prefix + '.fact'),
                              File[POS](file_prefix + '.pos'),
                              File[TERM](file_prefix + '.terms'))
            if start:
                start = False
    if dict_prefix:
        save_abbrev_dicts(File[ABBR](str(dict_prefix) + ".dict_abbr_to_full"),
                          File[ABBR](str(dict_prefix) + ".dict_full_to_abbr"))
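## Pipeline sketch: for each prefix in the file list, <prefix>.txt3 is read and
## <prefix>.abbr, <prefix>.fact, <prefix>.pos and <prefix>.terms are written;
## when dict_prefix is supplied, the accumulated abbreviation dictionaries are
## saved once, after all files have been processed.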
def main(args):
    file_list = args[1]
    with File(file_list).openText() as instream:
        for line in instream.readlines():
            fact, pos = line.strip().split(';')
            fix_bad_char_in_file(File[FACT](fact), File[POS](pos))
def expand(string: str) -> str:
    """Expand abbreviations within string"""
    global abbreviations
    if not abbreviations:
        _get_abbreviations(File('./jargon.out'))
    words = string.split()
    for i in range(len(words)):
        temp = abbreviations.get(words[i])
        if temp:
            words[i] = temp
    string = _reGlue(words)
    return string
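## Usage sketch, assuming a hypothetical jargon.out entry "information|||info||infos"
## (fullword|||short||short, as parsed by _get_abbreviations below):
##   expand('the info pamphlet')  ->  'the information pamphlet'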
def getTerms(self, filename, filters=[], relaxed=False, overwrite=False):
    """Input file, output a FreqDist of terms"""
    filterfname = os.path.join(os.path.dirname(filename), 'filter.save')
    if os.path.exists(filename + '.nps') and os.path.exists(filterfname):
        f = File(filename + '.nps').openBin(mode='r')
        old_filters, fd = pickle.load(f)
        f.close()
        if old_filters == filters:
            if not Filter.unstemdict:
                Filter._get_stemdict(filterfname)
            return fd
    NPs = self.getNPs(filename)
    fd = FreqDist()
    for NP in NPs:
        # get the possible terms for each NP
        terms = self.extractPossibleTerms(NP, relaxed)
        # filter each term by some given criteria
        # this requires keeping case information until this point
        # filt = Filter.Filter()  # class containing all filters
        for t in terms:
            for f in filters:
                t = Filter.criteria[f](t)  # @semanticbeeng @todo global state mutation
            if t:
                fd[t] += 1
    if overwrite or (not os.path.isfile(filename + '.nps')):
        f = File[CHUNKNPS](filename + '.nps').openBin('w')
        pickle.dump((filters, fd), f)
        f.close()
        if os.path.exists(filterfname):
            os.remove(filterfname)
    return fd
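## Caching note: <filename>.nps holds a pickled (filters, FreqDist) pair, so a
## repeat call with an identical filter list skips NP extraction entirely;
## pass overwrite=True to force recomputation (this also removes filter.save).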
def main(args):
    # global special_domains
    Refactoring.run_filter_phase = True
    file_prefix = args[1]
    web_score_dict_file = args[2]
    use_web_score: bool
    if args[3].lower() in ['true', 't']:
        use_web_score = True
    elif args[3].lower() in ['false', 'f']:
        use_web_score = False
    else:
        print('You set the webscore flag to', args[3],
              'but it must be either "True" or "False".')
        print('Use "True" if you want the system to use the webscore function; '
              'the system will run more slowly but be more accurate.')
        print('Use "False" otherwise.')
        return  # halt on an invalid flag, as the other entry points do
    max_term_number = int(args[4])
    if (len(args) > 5) and (args[5].lower() != 'false'):
        dictionary.special_domains.extend(args[5].split('+'))  # @semanticbeeng @todo @arch global state initialization
    dictionary.initialize_utilities()
    filter_terms(File[TERM](file_prefix + ".all_terms"),
                 File(file_prefix + ".scored_output"),
                 File[ABBR](file_prefix + ".dict_abbr_to_full"),
                 # full_abbr_file = file_prefix + ".dict_full_to_abbr"  # @semanticbeeng not used
                 use_web_score,
                 numeric_cutoff=max_term_number,
                 reject_file=File(file_prefix + ".rejected-terms"),
                 web_score_dict_file=File(web_score_dict_file))
def main(args):
    # global special_domains
    file_list = args[1]
    if len(args) > 2:
        outfile_prefix = args[2]
    else:
        outfile_prefix = False
    if (len(args) > 3) and (args[3].lower() != 'false'):
        dictionary.special_domains.extend(args[3].split('+'))  # @semanticbeeng @todo @arch global state initialization
    dictionary.initialize_utilities()
    run_abbreviate_on_file_list(File(file_list),
                                dict_prefix=outfile_prefix)  # @semanticbeeng @todo @arch global state mutation
def _get_abbreviations(file: File) -> None:
    """Import abbreviations from jargon file"""
    global abbreviations
    f = file.openText()
    for line in f.readlines():
        temp = line.split('|||')
        fullword = temp[0]
        shortwords = temp[1].split('||')
        for w in shortwords:
            abbreviations[w] = fullword
    abbreviations = dictionary.freeze_dict(abbreviations)  # @semanticbeeng @todo global state initialization
    f.close()
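## Jargon file format, as parsed above: one entry per line,
##   fullword|||short1||short2||...
## Every short form is mapped back to its full word in `abbreviations`.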
def load_web_score_dict_file(dict_file: File) -> None:
    global webscore_dict
    webscore_dict.clear()
    if os.path.isfile(dict_file.name):
        with dict_file.openText() as instream:
            for line in instream.readlines():
                line = line.strip(os.linesep)
                term, score = line.split('\t')
                webscore_dict[term] = float(score)
        webscore_dict = dictionary.freeze_dict(webscore_dict)  # @semanticbeeng @todo global state initialization: this fails
    else:
        print(dict_file, 'does not exist and will be created.')
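## Expected dictionary file format: one tab-separated "term<TAB>score" pair per
## line, where the score parses as a float, e.g. (illustrative entry)
##   transgenic mouse	0.87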
def main(args):
    # global special_domains
    Refactoring.run_filter_phase = False
    file_list = args[1]
    if len(args) > 2:
        outfile_prefix = args[2]
    else:
        outfile_prefix = False
    if (len(args) > 3) and (args[3].lower() != 'false'):
        dictionary.special_domains.extend(args[3].split('+'))  # @semanticbeeng @todo @arch global state initialization
    dictionary.initialize_utilities()
    find_inline_terms_for_file_list(File(file_list), dict_prefix=outfile_prefix)
def read_in_org_dictionary(dict_file: File,
                           dictionary: str = 'org',
                           shallow: bool = True,
                           lower: bool = False,
                           patent: bool = False) -> None:
    global organization_dictionary
    global location_dictionary
    global nationality_dictionary
    if dictionary == 'org':
        organization_dictionary.clear()
    elif dictionary == 'loc':
        location_dictionary.clear()
    elif dictionary == 'nat':
        nationality_dictionary.clear()
    elif dictionary == 'discourse':
        raise Exception('undefined variable discourse_dictionary')  # @semanticbeeng @todo dead code
        # discourse_dictionary.clear()
    elif dictionary == 'term_relation':
        raise Exception('undefined variable term_rel_dictionary')  # @semanticbeeng @todo dead code
        # term_rel_dictionary.clear()
    with dict_file.openText(mode='r') as instream:
        for line in instream.readlines():
            add_dictionary_entry(line, dictionary, shallow, lower=lower,
                                 patent=patent)  # @semanticbeeng @arch global state immutable
    if dictionary == 'org':
        organization_dictionary = freeze_dict(organization_dictionary)
    elif dictionary == 'loc':
        location_dictionary = freeze_dict(location_dictionary)
    elif dictionary == 'nat':
        nationality_dictionary = freeze_dict(nationality_dictionary)
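## Usage sketch (assumes this function is importable alongside the dictionary
## path constants defined below, e.g. ORG_DICTIONARY and LOC_DICTIONARY):
##   read_in_org_dictionary(ORG_DICTIONARY, dictionary='org')
##   read_in_org_dictionary(LOC_DICTIONARY, dictionary='loc')
## Each call clears, repopulates, and then freezes the matching global dict.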
def read_in_pos_file() -> None:
    # global pos_dict
    global pos_file
    global jargon_files
    config.pos_dict.clear()
    for line in pos_file.openText().readlines():
        line = line.strip()
        items = line.split('\t')
        config.pos_dict[items[0]] = items[1:]
    for dictionary in special_domains:
        jargon_files.append(File(dictionary_table[dictionary]))
    for jargon_file in jargon_files:
        ## remove jargon from dictionary
        with jargon_file.openText() as instream:
            for line in instream.readlines():
                word: str = line.strip()
                word = word.lower()
                if word in config.pos_dict:
                    ## pos_dict.pop(word)
                    jargon_words.add(word)
import os
from utilspie.collectionsutils import frozendict
from typing import List, Dict, Tuple
from DataDef import File
import term_utilities
import config

##
# Dictionaries
#
DICT_DIRECTORY = os.path.dirname(os.path.realpath(__file__)) + os.sep + "dicts" + os.sep
## DICT_DIRECTORY = '../'
## DICT_DIRECTORY = './'

ORG_DICTIONARY: File = File(DICT_DIRECTORY + 'org_dict.txt')
LOC_DICTIONARY: File = File(DICT_DIRECTORY + 'location-lisp2-ugly.dict')
# # @semanticbeeng not used
# # NAT_DICTIONARY = DICT_DIRECTORY + 'nationalities.dict'
# DISC_DICTIONARY = DICT_DIRECTORY + 'discourse.dict'
# TERM_REL_DICTIONARY = DICT_DIRECTORY + 'term_relation.dict'

nom_file: File = File(DICT_DIRECTORY + 'NOMLIST.dict')
pos_file: File = File(DICT_DIRECTORY + 'POS.dict')
nom_map_file: File = File(DICT_DIRECTORY + 'nom_map.dict')
person_name_file: File = File(DICT_DIRECTORY + 'person_name_list.dict')
nat_name_file: File = File(DICT_DIRECTORY + 'nationalities_name_list.dict')
skippable_adj_file: File = File(DICT_DIRECTORY + 'out_adjectives.dict')
out_ing_file: File = File(DICT_DIRECTORY + 'out_ing.dict')
time_name_file: File = File(DICT_DIRECTORY + 'time_names.dict')