コード例 #1
0
ファイル: posextractor.py プロジェクト: rforge/sigil
 def process(self,file):
     """Compute relative frequencies of the configured POS n-grams for a file.

     The file is read through ``InputReader`` and parsed with ``CQPFormat``;
     column 1 is used as the tag sequence (presumably the POS tags, judging
     by the variable names).  Each tag is truncated to its first three
     characters, and uni-, bi- and trigrams are formed by joining the
     truncated tags with ``_``.  Only n-grams whose name is listed in
     ``self.posnames`` are counted.

     Returns:
         A ``(id, feats)`` tuple: the ID reported by the reader and a dict
         mapping every name in ``self.posnames`` to its count divided by
         the number of counted positions (``len(pos) - 2``).  When the file
         has fewer than three tags there is nothing to count and every
         value is ``0.0``.
     """
     Extractor.process(self,file)
     ir = InputReader(file)
     ir.read()
     cqpf = CQPFormat(ir.getText())
     pos = cqpf.getColumn(1)

     # Every requested feature starts at zero so it is always present in
     # the result, even if it never occurs in this file.
     feats = dict.fromkeys(self.posnames, 0)

     # Start at index 2: the first two positions have no full trigram
     # context and are deliberately ignored.
     for i in range(2, len(pos)):
         uni = (pos[i])[0:3]
         bi = (pos[i-1])[0:3] + "_" + uni
         tri = (pos[i-2])[0:3] + "_" + bi
         for ngram in (uni, bi, tri):
             if ngram in feats:
                 feats[ngram] += 1

     # Normalise by the number of counted positions.  Guard the denominator:
     # with exactly two tags it would be zero (ZeroDivisionError) and with
     # fewer it would be negative.  Iterating over the dict keys (instead of
     # self.posnames) ensures a duplicated name is normalised only once.
     positions = len(pos) - 2
     if positions > 0:
         scale = float(positions)
         for name in feats:
             feats[name] /= scale
     else:
         for name in feats:
             feats[name] = 0.0

     return ir.getID(),feats
コード例 #2
0
ファイル: statextractor.py プロジェクト: rforge/sigil
 def process(self,file):
     """Extract the regex-, lemma- and statistics-based features of a file.

     Steps, in order:
       1. read the file and parse it with ``CQPFormat``;
       2. for every ``"s"`` annotation span, build word/POS tokens from
          columns 0 and 1 (POS tags go through ``disambiguatePOS``) and
          wrap them in ``<s>`` / ``</s> `` markers;
       3. run the ``DIRECT_FEATS`` and ``CALC_FEATS`` extractors on the
          joined token string and the ``LEMMA_FEATS`` extractor on
          column 2;
       4. post-process with ``calculateFeats`` and ``normalizeByLength``
          (length = number of entries in column 2);
       5. merge in ``extractStatistics`` last, so those values are added
          after the length normalisation call.

     The resulting feature dict is printed before returning.

     Returns:
         A ``(id, feats)`` tuple with the ID reported by the reader.
     """
     Extractor.process(self,file)
     reader = InputReader(file)
     reader.read()
     cqpf = CQPFormat(reader.getText())
     lemma = cqpf.getColumn(2)

     # One flat token list for the whole file; each sentence span is
     # delimited by explicit markers.  The closing marker carries a trailing
     # space, so two spaces follow it once the list is joined.
     pieces = []
     for start, end, _attr in cqpf.getAnnotations("s"):
         pieces.append('<s>')
         words = cqpf.getColumn(0)[start:end]
         tags = self.disambiguatePOS(cqpf.getColumn(1)[start:end])
         pieces.extend(self.getWordsWithPOS(words, tags))
         pieces.append('</s> ')
     wordpos = ' '.join(pieces)

     # Later updates overwrite earlier keys, so the order below matters.
     feats = {}
     direct = self.extractWithREs(self.DIRECT_FEATS, wordpos)
     feats.update(direct)
     calculated = self.extractWithREs(self.CALC_FEATS, wordpos)
     feats.update(calculated)
     from_lemmas = self.extractFromLemmatatizedForms(self.LEMMA_FEATS, lemma)
     feats.update(from_lemmas)

     self.calculateFeats(feats)
     self.normalizeByLength(feats, len(lemma))

     statistics = self.extractStatistics(cqpf)
     feats.update(statistics)

     # Parenthesised form prints the same text under the Python 2 print
     # statement used by this code base.
     print(feats)
     return reader.getID(), feats
コード例 #3
0
def main():
    """Look up every input word and write its dictionary entries to the output.

    Command line:
        -c / --config  path of the INI config file (default ``config.ini``).

    The config file supplies the ``LANGUAGE``, ``EXAMPLE_SENTENCES``,
    ``INPUT``, ``OUTPUT`` and ``AUDIO`` sections used to build the API
    client, input reader, output writer and audio manager.  For each word
    from the input reader, the dictionary results are fetched, the word's
    audio file is attached to every result under the ``'Pronounciation'``
    key (spelling kept as-is because it is a runtime key), and the results
    are written out.  Words that yield no results are collected and printed
    at the end.

    Raises:
        SystemExit: with status 2 (via ``argparse``) when the config file
            cannot be read, and with status 1 when a ``PermissionError``
            is raised while processing a word.
    """
    # Parse the arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-c',
                        '--config',
                        type=str,
                        metavar='config_file',
                        help='The path for the config file',
                        default='config.ini')
    args = parser.parse_args()

    # Read the config file options.  ConfigParser.read() silently skips
    # files it cannot open and returns the list of files it did parse, so an
    # empty result means the config is missing; report that explicitly
    # instead of failing later with an opaque KeyError on a section name.
    config = configparser.ConfigParser()
    if not config.read(args.config):
        parser.error(f'cannot read config file: {args.config}')

    # Initialize the classes
    api = APIAccess(
        config['LANGUAGE']['L2'], config['LANGUAGE']['L1'],
        config.getboolean('EXAMPLE_SENTENCES', 'PreferLongSentences'),
        config.getboolean('EXAMPLE_SENTENCES', 'Cloze'))
    input_reader = InputReader(config['INPUT']['Mode'],
                               config['INPUT']['FileName'])
    output_writer = OutputWriter(config['OUTPUT'])
    audio = AudioManager(config['LANGUAGE']['L2'], config['AUDIO']['Folder'],
                         config.getboolean('AUDIO', 'Normalize'))

    # Read the input file and get all the distinct meanings for each
    # word, then append them to the output file
    failed = []
    for word in input_reader.get_next_word():
        try:
            results = api.get_dict_info(word)

            # Try to find the audio file for the word and update the results
            audio_file = audio.get_audio(word)
            for result in results:
                result['Pronounciation'] = audio_file

            output_writer.write_output(results)
        except PermissionError as e:
            # Happens if the Lexicala API doesn't allow connection.
            # Terminate with a non-zero status so callers can detect the
            # failure; SystemExit is raised directly because the `exit`
            # builtin is only installed by the `site` module.
            print(e)
            print('Terminating the Program')
            raise SystemExit(1)
        # Any other exception is unexpected and simply propagates.

        # Book keeping and printing
        if not results:
            failed.append(word)

        print(f'{word} finished with {len(results)} results')

    print(f'Failed: {failed}')
コード例 #4
0
 def process(self,file):
     """Return the file's ID together with statistics over sentence lengths.

     A sentence length is ``end - start`` for each ``"s"`` annotation span
     reported by ``CQPFormat``.  The statistics themselves are produced by
     ``utils.getStats`` under the ``"SENT_LENGTH"`` label; their exact
     contents are defined by that helper.  The extractor's feature names
     are printed as a side effect.

     Returns:
         A ``(id, feats)`` tuple with the ID reported by the reader.
     """
     Extractor.process(self,file)
     reader = InputReader(file)
     reader.read()
     corpus = CQPFormat(reader.getText())

     # Each annotation is a (start, end, attributes) triple; only the span
     # width is needed here.
     lengths = []
     for span_start, span_end, _attrs in corpus.getAnnotations("s"):
         lengths.append(span_end - span_start)

     # Parenthesised form prints the same text under the Python 2 print
     # statement used by this code base.
     print(self.__featureNames)
     stats = utils.getStats("SENT_LENGTH", lengths)
     return reader.getID(), stats