def __init__(self):
    """Set classification thresholds and construct service clients."""
    # Ratio above this value => pattern usage is treated as confidently correct.
    self.CONFIDENT = 0.2
    # Ratio below this value => pattern usage is treated as likely wrong.
    self.UNCONFIDENT = 0.1
    # Backing store for pattern/ngram counts and ratios.
    self.mongodb_client = MongoDBClient()
    # Sentence parser — presumably spaCy-like; tokens expose .lemma_/.dep_ (TODO confirm).
    self.parser = Parser()
    # Extracts dependency-pattern info per token.
    self.dependency_extractor = DependencyExtractor()
class Suggester:
    """Suggests alternative grammatical patterns, with example n-grams,
    ranked by token-level edit distance against the query."""

    def __init__(self):
        # Backing store for pattern counts and n-gram examples.
        self.mongodb_client = MongoDBClient()

    def _suggest_patterns(self, original_pattern, all_patterns, k=5):
        """Return the k patterns in *all_patterns* closest to *original_pattern*.

        Distance is token-level edit distance between the space-split
        patterns; candidates carry their pattern at ['_id']['norm_pattern'].
        """
        reference_tokens = original_pattern.split(' ')

        def distance(candidate):
            return edit_distance(
                reference_tokens, candidate['_id']['norm_pattern'].split(' '))

        return sorted(all_patterns, key=distance)[:k]

    def _suggest_ngrams(self, ngram_tks, pair_candidates, k=3):
        """Return the k (ngram, sentence) pairs whose ngram is closest to *ngram_tks*."""
        def distance(pair):
            return edit_distance(ngram_tks, pair[0].split(' '))

        return sorted(pair_candidates, key=distance)[:k]

    def _edit_ngram(self):
        # Placeholder — not implemented yet.
        pass

    def _edit_sentence(self):
        # Placeholder — not implemented yet.
        pass

    def process(self, query, k_patterns=5):
        """Build ranked suggestions for *query*.

        Expected query shape::

            {
                'key': tk.lemma_|tk.dep_,
                'norm_pattern': norm_ptn,
                'ngram': ngram
            }

        Returns a list of dicts with 'norm_pattern', 'percent' (floored
        share of the total count), and 'ngrams' (closest example pairs).
        """
        total_count = self.mongodb_client.get_total_counts(query['key'])
        top_k_patterns = self.mongodb_client.get_top_pattern_counts(
            query['key'], k_patterns)
        query_tokens = query['ngram'].split(' ')

        suggestions = []
        for each in top_k_patterns:
            ngram_key = f'{query["key"]}|{each["norm_pattern"]}'
            candidates = [(doc['ngram'].split('|')[0], doc['sent'])
                          for doc in self.mongodb_client.get_ngrams(ngram_key)]
            # EAFP: total_count may be zero when the key has no recorded patterns.
            try:
                share = each['count'] / total_count
            except ZeroDivisionError:
                share = 0
            suggestions.append({
                'norm_pattern': each['norm_pattern'],
                'percent': math.floor(share * 100),
                'ngrams': self._suggest_ngrams(query_tokens, candidates),
            })
        return suggestions
def main():
    """Stream a gzipped corpus, extract dependency patterns and n-grams,
    and periodically bulk-upload the accumulated data to MongoDB."""
    parser = Parser()
    data_cleaner = DataCleaner()
    dependency_extractor = DependencyExtractor()
    mongo_client = MongoDBClient()
    mongo_client.create_indexes()

    filename = 'coca.txt.gz'  # previously: 'bnc.parse.txt.gz'
    corpus_path = '/Users/whan/Data/' + filename

    with gzip.open(corpus_path, 'rt', encoding='utf8') as fs:
        pattern_counter = Counter()
        ngram_set = set()
        for line_no, raw_line in enumerate(tqdm(fs), 1):
            doc = parser.parse(raw_line.strip())
            if data_cleaner.is_valid_data(doc, raw_line):
                score = round(dependency_extractor.score(doc), 2)
                # Low-confidence parses are skipped entirely (this `continue`
                # also bypasses the periodic upload check below, matching the
                # original control flow).
                if score < 0.6:
                    continue
                for token in doc:
                    info = dependency_extractor.process(token)
                    if not info:
                        continue
                    key = f'{token.lemma_}|{token.dep_}'
                    pattern_counter[(key, info['norm_pattern'])] += 1
                    ngram_key = f'{key}|{info["norm_pattern"]}'
                    ngram = f'{info["ngram"]}|{info["pattern"]}'
                    # Mark the matched tokens with <w>...</w> in the sentence.
                    marked_sent = ' '.join(
                        f'<w>{tk.text}</w>' if tk.i in info['indices'] else tk.text
                        for tk in doc)
                    ngram_set.add((ngram_key, ngram, marked_sent, score))
            if line_no % 50000 == 0:
                upload_to_db(mongo_client, pattern_counter, ngram_set)
        # Final upload for the trailing partial batch.
        upload_to_db(mongo_client, pattern_counter, ngram_set)
class RawProcessor:
    """Streams a gzipped, pre-parsed corpus and records matched grammar
    patterns (counts plus example sentences) in MongoDB."""

    # Number of input entries processed between bulk uploads.
    BATCH_SIZE = 50000

    def __init__(self, data_directory):
        """
        Args:
            data_directory: directory containing the corpus files and the
                EGP/EVP resources used by the recommender.
        """
        self.data_directory = data_directory
        self.recommend = Recommend(Parser(), EGP(
            data_directory), EVP(data_directory))
        self.mongoDBClient = MongoDBClient()
        self.mongoDBClient.create_indexes()

    def calc_ngram_and_count(self, filename):
        """Count pattern matches and collect example sentences from *filename*,
        uploading to MongoDB in batches of ``BATCH_SIZE`` entries.

        Bug fix: the original only uploaded inside the ``i % 50000 == 0``
        check, so the final partial batch — or an entire file shorter than
        50000 entries — was silently dropped. The remainder is now flushed
        after the loop.
        """
        path = os.path.join(self.data_directory, filename)
        with gzip.open(path, 'rt', encoding='utf8') as fs:
            local_cache_cnt = Counter()
            local_cache_sents = []
            for i, entry in enumerate(tqdm(fs), 1):
                parsed_entry = ParsedEntry(entry)
                matches, parsed_entry, full_sent_matches = self.recommend.match_patterns(
                    parsed_entry, is_parsed_content=True,
                    return_full_sentence_matches=True)
                for match in matches:
                    key = (match['match'], match['rule_num'],
                           match['level'], match['ngram'])
                    local_cache_cnt[key] += 1
                # Collect example sentences; only full-sentence matches are kept.
                for match in full_sent_matches:
                    indices = set(match['indices'])
                    sentence = ' '.join(
                        '<w>' + tk.text + '</w>' if tk.i in indices else tk.text
                        for tk in parsed_entry)
                    local_cache_sents.append(
                        {'ngram': match['ngram'], 'sentence': sentence})
                if i % self.BATCH_SIZE == 0:
                    print(i, "Uploading to MongoDB.")
                    self._flush(local_cache_cnt, local_cache_sents)
            # Flush whatever is left over from the last partial batch.
            if local_cache_cnt or local_cache_sents:
                print("Uploading final batch to MongoDB.")
                self._flush(local_cache_cnt, local_cache_sents)

    def _flush(self, local_cache_cnt, local_cache_sents):
        """Bulk-upload accumulated counts and sentences, then clear both
        caches in place so the caller can keep reusing them."""
        documents = [({
            'match': match,
            'rule_num': rule_num,
            'level': level,
            'ngram': ngram,
            'tokens': ngram.split(' ')
        }, count) for (match, rule_num, level, ngram), count in local_cache_cnt.items()]
        print("Start to update ngram counts in bulk.")
        start_time = datetime.datetime.now()
        self.mongoDBClient.bulk_inc_ngram_count(documents)
        local_cache_cnt.clear()
        end_time = datetime.datetime.now()
        print("End update ngram counts in bulk with elapsed seconds: "
              + str((end_time - start_time).total_seconds()))
        print("Start to insert ngram sentences in bulk.")
        start_time = datetime.datetime.now()
        self.mongoDBClient.add_sentences(local_cache_sents)
        local_cache_sents.clear()
        end_time = datetime.datetime.now()
        print("End insert ngram sentences in bulk with elapsed seconds: "
              + str((end_time - start_time).total_seconds()))
class Corrector:
    """Annotates each extractable token of a sentence with a confidence
    marker based on how often its dependency pattern occurs in the corpus."""

    def __init__(self):
        # Ratio above CONFIDENT => usage looks right; below UNCONFIDENT =>
        # looks wrong; anything between is undecided.
        self.CONFIDENT = 0.2
        self.UNCONFIDENT = 0.1
        self.mongodb_client = MongoDBClient()
        self.parser = Parser()
        self.dependency_extractor = DependencyExtractor()

    def _categorize(self, ratio):
        """Map a pattern ratio to 'right' / 'wrong' / 'not_sure'."""
        if ratio > self.CONFIDENT:
            return 'right'
        if ratio < self.UNCONFIDENT:
            return 'wrong'
        return 'not_sure'

    def _get_template(self, ratio):
        """Return the edit-markup template matching the ratio's category."""
        if ratio > self.CONFIDENT:
            return '{{+{}//{}+}}'
        if ratio < self.UNCONFIDENT:
            return '[-{}//{}-]'
        return '\\*{}//{}*\\'

    def process(self, sent):
        """Parse *sent* and wrap each extractable token in a confidence
        template; returns (annotated_sentence, per-token metadata)."""
        doc = self.parser.parse(sent)  # TODO: normalize input?
        pieces = []
        meta = {}
        for token in doc:
            info = self.dependency_extractor.process(token)
            if not info:
                pieces.append(token.text)
                continue
            key = f'{token.lemma_}|{token.dep_}'
            ratio = self.mongodb_client.get_pattern_ratio(
                key, info['norm_pattern'])
            pieces.append(
                self._get_template(ratio).format(token.text, token.i))
            meta[str(token.i)] = {
                'key': key,
                'norm_pattern': info['norm_pattern'],
                'ngram': info['ngram']
            }
        return (' '.join(pieces), meta)
def __init__(self):
    """Construct the MongoDB client backing all lookups."""
    self.mongodb_client = MongoDBClient()
def __init__(self, data_directory):
    """Wire up the recommender and the MongoDB client.

    data_directory: directory holding the corpus files and the EGP/EVP
    resources used by the recommender.
    """
    self.data_directory = data_directory
    # Recommender built from a parser plus the EGP/EVP resources.
    self.recommend = Recommend(Parser(), EGP(
        data_directory), EVP(data_directory))
    self.mongoDBClient = MongoDBClient()
    # Ensure the required indexes exist before any bulk operations.
    self.mongoDBClient.create_indexes()
def __init__(self, parser, egp, evp):
    """Store the injected parser and EGP/EVP resources, and connect to MongoDB.

    Args:
        parser: sentence parser used for matching.
        egp: English Grammar Profile resource.
        evp: English Vocabulary Profile resource.
    """
    self.mongoDBClient = MongoDBClient()
    # Fix: raw string for the regex — the original '\w+|...' relies on an
    # invalid escape sequence, which raises a SyntaxWarning on Python 3.6+
    # and is slated to become an error. The compiled pattern is unchanged:
    # it matches a word or a single punctuation mark.
    self.re_token = re.compile(r'\w+|[,.:;!?]')
    self.parser = parser
    self.egp = egp
    self.evp = evp