def process(self, file):
    """Compute relative frequencies of the POS n-grams in ``self.posnames``.

    The file is read through ``InputReader``, its text is parsed with
    ``CQPFormat`` and column 1 is used as the tag sequence.  Every tag is
    cut to its first three characters.  Starting at the third tag, a
    unigram (current tag), a bigram (previous + current) and a trigram
    (two previous + current) are formed by joining the truncated tags
    with ``_``.  Only n-grams that are listed in ``self.posnames`` are
    counted.

    Each count is divided by ``len(pos) - 2``, the number of positions
    visited.  When the document has two tags or fewer there is nothing to
    count, so every feature is reported as ``0.0`` instead of dividing by
    zero (or by a negative number).

    Args:
        file: Passed unchanged to ``Extractor.process`` and ``InputReader``.

    Returns:
        A tuple ``(ir.getID(), feats)`` where ``feats`` maps every name in
        ``self.posnames`` to a float.
    """
    Extractor.process(self, file)
    ir = InputReader(file)
    ir.read()
    cqpf = CQPFormat(ir.getText())
    pos = cqpf.getColumn(1)

    # Initialize counts so every requested feature is present in the result.
    feats = {}
    for name in self.posnames:
        feats[name] = 0

    # Slide a three-tag window over the truncated tags; the first two tags
    # never act as the "current" tag, matching the normalization below.
    tags = [tag[0:3] for tag in pos]
    for first, second, third in zip(tags, tags[1:], tags[2:]):
        uni = third
        bi = second + "_" + uni
        tri = first + "_" + bi
        for ngram in (uni, bi, tri):
            if ngram in feats:
                feats[ngram] += 1

    # Normalize once per feature key.  Iterating the dict (not posnames)
    # keeps a name that is listed twice from being divided twice.
    total = len(pos) - 2
    for name in feats:
        if total > 0:
            feats[name] /= float(total)
        else:
            feats[name] = 0.0

    return ir.getID(), feats
def process(self, file):
    """Extract regex-, lemma- and statistics-based features from one file.

    The file is read through ``InputReader`` and parsed with ``CQPFormat``.
    For every ``"s"`` annotation span, the tokens of column 0 and the
    disambiguated tags of column 1 are combined by ``getWordsWithPOS`` and
    wrapped in ``<s>`` / ``</s> `` markers; all spans are joined with single
    spaces into one string.  That string is fed to ``extractWithREs`` for
    ``DIRECT_FEATS`` and ``CALC_FEATS``; column 2 is fed to
    ``extractFromLemmatatizedForms`` for ``LEMMA_FEATS``.  The collected
    features are then passed to ``calculateFeats`` and
    ``normalizeByLength`` (with the length of column 2), and finally merged
    with the result of ``extractStatistics``.

    Args:
        file: Passed unchanged to ``Extractor.process`` and ``InputReader``.

    Returns:
        A tuple ``(ir.getID(), feats)`` where ``feats`` is the feature dict.
    """
    feats = {}
    Extractor.process(self, file)
    ir = InputReader(file)
    ir.read()
    cqpf = CQPFormat(ir.getText())

    lemma = cqpf.getColumn(2)
    sentences = cqpf.getAnnotations("s")

    # Fetch the columns once instead of once per sentence.
    # NOTE(review): assumes getColumn is a side-effect-free accessor - confirm.
    words = cqpf.getColumn(0)
    tags = cqpf.getColumn(1)

    wordpostmp = []
    for (start, end, _attr) in sentences:
        wordpostmp.append('<s>')
        wordpostmp.extend(self.getWordsWithPOS(
            words[start:end],
            self.disambiguatePOS(tags[start:end])))
        # The trailing blank inside the marker is kept exactly as before;
        # the regex features are matched against this text.
        wordpostmp.append('</s> ')
    wordpos = ' '.join(wordpostmp)

    feats.update(self.extractWithREs(self.DIRECT_FEATS, wordpos))
    feats.update(self.extractWithREs(self.CALC_FEATS, wordpos))
    feats.update(self.extractFromLemmatatizedForms(self.LEMMA_FEATS, lemma))
    self.calculateFeats(feats)
    self.normalizeByLength(feats, len(lemma))
    feats.update(self.extractStatistics(cqpf))

    # Debug output kept from the original; written as a call so the line
    # parses under both Python 2 and Python 3 with identical output.
    print(feats)
    return ir.getID(), feats
def main():
    """Look up every input word and write its dictionary entries to the output.

    Reads the path of an INI config file from ``-c/--config`` (default
    ``config.ini``), builds the API client, input reader, output writer and
    audio manager from it, then processes the words yielded by the input
    reader one by one.  For each word the dictionary results are fetched,
    each result gets the word's audio file stored under the
    ``'Pronounciation'`` key, and the results are handed to the output
    writer.  Words that produced no results are collected and printed at
    the end.

    Exits through ``parser.error`` when the config file cannot be read, and
    with status 0 when a ``PermissionError`` is raised while processing a
    word.  Any other exception propagates to the caller.
    """
    # Local import so this function does not depend on the module header.
    import sys

    # Parse the arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, metavar='config_file',
                        help='The path for the config file',
                        default='config.ini')
    args = parser.parse_args()

    # Read the config file options.  ConfigParser.read() skips files it
    # cannot open and returns the list of files it did parse, so an empty
    # list means the config is missing; report that instead of failing
    # later with a bare KeyError on the first section lookup.
    config = configparser.ConfigParser()
    if not config.read(args.config):
        parser.error(f'Could not read config file: {args.config}')

    # Initialize the classes
    api = APIAccess(
        config['LANGUAGE']['L2'],
        config['LANGUAGE']['L1'],
        config.getboolean('EXAMPLE_SENTENCES', 'PreferLongSentences'),
        config.getboolean('EXAMPLE_SENTENCES', 'Cloze'))
    input_reader = InputReader(config['INPUT']['Mode'],
                               config['INPUT']['FileName'])
    output_writer = OutputWriter(config['OUTPUT'])
    audio = AudioManager(config['LANGUAGE']['L2'],
                         config['AUDIO']['Folder'],
                         config.getboolean('AUDIO', 'Normalize'))

    # Read the input file and get all the distinct meanings for each
    # word, then append them to the output file
    failed = []
    for word in input_reader.get_next_word():
        try:
            results = api.get_dict_info(word)
            # Try to find the audio file for the word and update the results
            audio_file = audio.get_audio(word)
            for result in results:
                # Key spelling is runtime data and is left untouched.
                result['Pronounciation'] = audio_file
            output_writer.write_output(results)
        except PermissionError as e:
            # Happens if the Lexicala API doesn't allow connection
            print(e)
            print('Terminating the Program')
            # sys.exit instead of the site-provided exit(), which is not
            # available in every interpreter setup; status kept at 0.
            sys.exit(0)

        # Book keeping and printing
        if not results:
            failed.append(word)
        print(f'{word} finished with {len(results)} results')

    print(f'Failed: {failed}')
def process(self, file):
    """Compute sentence-length statistics for one file.

    The file is read through ``InputReader`` and parsed with ``CQPFormat``.
    The length of every ``"s"`` annotation span is taken as ``end - start``
    and the list of lengths is handed to ``utils.getStats`` under the
    prefix ``"SENT_LENGTH"``.

    Args:
        file: Passed unchanged to ``Extractor.process`` and ``InputReader``.

    Returns:
        A tuple ``(ir.getID(), feats)`` where ``feats`` is whatever
        ``utils.getStats`` returns.
    """
    Extractor.process(self, file)
    ir = InputReader(file)
    ir.read()
    cqpf = CQPFormat(ir.getText())

    lengths = [end - start for (start, end, _arg) in cqpf.getAnnotations("s")]

    # Debug output kept from the original; written as a call so the line
    # parses under both Python 2 and Python 3 with identical output.
    # NOTE(review): inside a class body the double-underscore name is
    # mangled to _<ClassName>__featureNames, so it only resolves if this
    # class itself defines the attribute - confirm.
    print(self.__featureNames)

    feats = utils.getStats("SENT_LENGTH", lengths)
    return ir.getID(), feats