Ejemplo n.º 1
0
    def __init__(self):
        """Store two ratio thresholds and create the helper objects.

        Builds a ``MongoDBClient``, a ``Parser`` and a
        ``DependencyExtractor`` and keeps each one on the instance.
        """
        # NOTE(review): presumably the upper/lower cut-offs applied to a
        # pattern ratio elsewhere in the class - confirm against the callers.
        self.CONFIDENT, self.UNCONFIDENT = 0.2, 0.1

        # Collaborators are created in this fixed order.
        self.mongodb_client = MongoDBClient()
        self.parser = Parser()
        self.dependency_extractor = DependencyExtractor()
Ejemplo n.º 2
0
class Suggester:
    """Suggest stored patterns and n-grams that are close to a query."""

    def __init__(self):
        """Create the MongoDB client used for every lookup."""
        self.mongodb_client = MongoDBClient()

    def _suggest_patterns(self, original_pattern, all_patterns, k=5):
        """Return the ``k`` patterns closest to ``original_pattern``.

        Args:
            original_pattern: Space-separated pattern string.
            all_patterns: Iterable of documents whose
                ``['_id']['norm_pattern']`` is a space-separated pattern.
            k: Maximum number of documents to return.

        Returns:
            Up to ``k`` documents, ordered by ascending ``edit_distance``
            between the two token lists.
        """
        original_tokens = original_pattern.split(' ')
        similar_patterns = sorted(
            all_patterns,
            key=lambda pattern: edit_distance(
                original_tokens, pattern['_id']['norm_pattern'].split(' ')))
        return similar_patterns[:k]

    def _suggest_ngrams(self, ngram_tks, pair_candidates, k=3):
        """Return the ``k`` candidate pairs closest to ``ngram_tks``.

        Args:
            ngram_tks: Token list of the query n-gram.
            pair_candidates: Iterable of pairs whose first element is a
                space-separated n-gram string.
            k: Maximum number of pairs to return.

        Returns:
            Up to ``k`` pairs, ordered by ascending ``edit_distance``
            between ``ngram_tks`` and the tokens of the pair's n-gram.
        """
        similar_pairs = sorted(
            pair_candidates,
            key=lambda pair: edit_distance(ngram_tks, pair[0].split(' ')))
        return similar_pairs[:k]

    def _edit_ngram(self):
        """Placeholder; not implemented."""
        pass

    def _edit_sentence(self):
        """Placeholder; not implemented."""
        pass

    def process(self, query, k_patterns=5):
        """Build suggestions for the most frequent patterns of a key.

        Args:
            query: Mapping of the form::

                    {
                        'key': tk.lemma_|tk.dep_,
                        'norm_pattern': norm_ptn,
                        'ngram': ngram
                    }

                Only ``'key'`` and ``'ngram'`` are read here.
            k_patterns: Number of top patterns requested from the database.

        Returns:
            A list with one dict per pattern returned by the database,
            holding ``'norm_pattern'``, ``'percent'`` (the pattern's share
            of the key's total count, floored to a whole percent; 0 when
            the total is 0) and ``'ngrams'`` (the closest
            ``(ngram, sentence)`` pairs for that pattern).
        """
        key = query['key']
        total_count = self.mongodb_client.get_total_counts(key)
        top_k_patterns = self.mongodb_client.get_top_pattern_counts(
            key, k_patterns)

        suggestions = []
        ngram_tks = query['ngram'].split(' ')
        for each in top_k_patterns:
            ngram_key = f'{key}|{each["norm_pattern"]}'
            # Stored n-grams look like '<ngram>|<pattern>'; keep the n-gram.
            ngrams = [(doc['ngram'].split('|')[0], doc['sent'])
                      for doc in self.mongodb_client.get_ngrams(ngram_key)]
            ngram_sggs = self._suggest_ngrams(ngram_tks, ngrams)

            try:
                # Scale before dividing: 29 / 100 * 100 evaluates to
                # 28.999999999999996 and would be floored to 28.
                percent = math.floor(each['count'] * 100 / total_count)
            except ZeroDivisionError:
                percent = 0

            suggestions.append({
                'norm_pattern': each['norm_pattern'],
                'percent': percent,
                'ngrams': ngram_sggs
            })
        return suggestions
Ejemplo n.º 3
0
def _collect_sentence_patterns(parsed_entry, sent_score, dependency_extractor,
                               pattern_counter, ngram_set):
    """Record the patterns and n-grams of one parsed sentence in the caches.

    For every token for which ``dependency_extractor.process`` returns a
    truthy info dict, increments ``pattern_counter[(key, norm_pattern)]``
    and adds an ``(ngram_key, ngram, sent, sent_score)`` tuple to
    ``ngram_set``.
    """
    for token in parsed_entry:
        info = dependency_extractor.process(token)
        if not info:
            continue
        key = f'{token.lemma_}|{token.dep_}'
        pattern_counter[(key, info['norm_pattern'])] += 1
        ngram_key = f'{key}|{info["norm_pattern"]}'
        ngram = f'{info["ngram"]}|{info["pattern"]}'
        # Copy of the sentence with the tokens listed in info['indices']
        # wrapped in <w> tags.
        sent = ' '.join([
            f'<w>{tk.text}</w>'
            if tk.i in info['indices'] else tk.text
            for tk in parsed_entry
        ])
        ngram_set.add((ngram_key, ngram, sent, sent_score))


def main():
    """Extract dependency patterns from the corpus and upload them.

    Reads the gzipped corpus line by line and parses each stripped line.
    Lines rejected by ``DataCleaner.is_valid_data`` or whose rounded score
    is below 0.6 contribute nothing. The accumulated pattern counts and
    n-grams are handed to ``upload_to_db`` after every 50000 lines and once
    more after the last line.

    NOTE(review): the caches are not reset here after a periodic upload -
    confirm that ``upload_to_db`` empties them.
    """
    parser = Parser()
    data_cleaner = DataCleaner()
    dependency_extractor = DependencyExtractor()
    mongo_client = MongoDBClient()
    mongo_client.create_indexes()
    filename = 'coca.txt.gz'
    with gzip.open('/Users/whan/Data/' + filename, 'rt',
                   encoding='utf8') as fs:
        pattern_counter = Counter()
        ngram_set = set()
        for i, entry in enumerate(tqdm(fs), 1):
            parsed_entry = parser.parse(entry.strip())
            # The cleaner receives the raw line, not the stripped one.
            origin_sent = entry
            if data_cleaner.is_valid_data(parsed_entry, origin_sent):
                sent_score = round(dependency_extractor.score(parsed_entry), 2)
                # Low-scoring sentences are skipped with a condition instead
                # of `continue`, so the periodic upload below still runs when
                # line 50000, 100000, ... happens to be one of them.
                if not sent_score < 0.6:
                    _collect_sentence_patterns(
                        parsed_entry, sent_score, dependency_extractor,
                        pattern_counter, ngram_set)

            if i % 50000 == 0:
                upload_to_db(mongo_client, pattern_counter, ngram_set)
        # Upload whatever was collected after the last full batch.
        upload_to_db(mongo_client, pattern_counter, ngram_set)
Ejemplo n.º 4
0
class RawProcessor:
    """Count pattern matches in gzipped corpus files and store them in MongoDB."""

    def __init__(self, data_directory):
        """Build the matcher and the database client.

        Args:
            data_directory: Directory handed to ``EGP`` and ``EVP`` and used
                to resolve the file names given to
                ``calc_ngram_and_count``.
        """
        self.data_directory = data_directory
        self.recommend = Recommend(Parser(), EGP(
            data_directory), EVP(data_directory))
        self.mongoDBClient = MongoDBClient()
        self.mongoDBClient.create_indexes()

    def _upload_cache(self, local_cache_cnt, local_cache_sents):
        """Upload the cached counts and sentences, then empty both caches.

        An empty cache is skipped, so no empty bulk request is issued.

        Args:
            local_cache_cnt: ``Counter`` keyed by
                ``(match, rule_num, level, ngram)``; cleared in place.
            local_cache_sents: List of ``{'ngram', 'sentence'}`` dicts;
                cleared in place.
        """
        if local_cache_cnt:
            # One (document, count) pair per distinct match key.
            documents = [({
                'match': match, 'rule_num': rule_num, 'level': level, 'ngram': ngram, 'tokens': ngram.split(' ')
            }, count) for (match, rule_num, level, ngram), count in local_cache_cnt.items()]

            print("Start to update ngram counts in bulk.")
            start_time = datetime.datetime.now()
            self.mongoDBClient.bulk_inc_ngram_count(documents)
            local_cache_cnt.clear()
            end_time = datetime.datetime.now()
            print("End update ngram counts in bulk with elapsed seconds: " +
                  str((end_time-start_time).total_seconds()))

        if local_cache_sents:
            print("Start to insert ngram sentences in bulk.")
            start_time = datetime.datetime.now()
            self.mongoDBClient.add_sentences(local_cache_sents)
            local_cache_sents.clear()
            end_time = datetime.datetime.now()
            print("End insert ngram sentences in bulk with elapsed seconds: " +
                  str((end_time-start_time).total_seconds()))

    def calc_ngram_and_count(self, filename):
        """Match every line of a gzipped file and upload counts and examples.

        Each line is wrapped in ``ParsedEntry`` and passed to
        ``Recommend.match_patterns``. Every match increments a counter keyed
        by ``(match, rule_num, level, ngram)``; every full-sentence match
        contributes an example sentence whose matched tokens are wrapped in
        ``<w>`` tags. The caches are uploaded every 50000 lines and once
        more after the last line, so a trailing partial batch is not lost.

        Args:
            filename: Name of the gzipped text file inside
                ``self.data_directory``.
        """
        with gzip.open(os.path.join(self.data_directory, filename), 'rt', encoding='utf8') as fs:
            local_cache_cnt = Counter()
            local_cache_sents = []
            i = 0  # stays 0 when the file is empty
            for i, entry in enumerate(tqdm(fs), 1):
                parsed_entry = ParsedEntry(entry)

                matches, parsed_entry, full_sent_matches = self.recommend.match_patterns(
                    parsed_entry, is_parsed_content=True, return_full_sentence_matches=True)

                for match in matches:
                    key = (match['match'], match['rule_num'],
                           match['level'], match['ngram'])
                    local_cache_cnt[key] += 1

                # collect example sentences and only care for full sentences
                for match in full_sent_matches:
                    indices = set(match['indices'])
                    sentence = ' '.join(
                        '<w>' + tk.text + '</w>' if tk.i in indices else tk.text for tk in parsed_entry)
                    local_cache_sents.append(
                        {'ngram': match['ngram'], 'sentence': sentence})

                if i % 50000 == 0:
                    print(i, "Uploading to MongoDB.")
                    self._upload_cache(local_cache_cnt, local_cache_sents)

            # Lines after the last multiple of 50000 are still cached here;
            # without this final upload they would never reach the database.
            if local_cache_cnt or local_cache_sents:
                print(i, "Uploading to MongoDB.")
                self._upload_cache(local_cache_cnt, local_cache_sents)
Ejemplo n.º 5
0
class Corrector:
    """Mark up the tokens of a sentence according to their pattern ratio."""

    def __init__(self):
        """Store the two ratio thresholds and create the helper objects."""
        # Ratios above CONFIDENT count as right, below UNCONFIDENT as wrong.
        self.CONFIDENT, self.UNCONFIDENT = 0.2, 0.1

        self.mongodb_client = MongoDBClient()
        self.parser = Parser()
        self.dependency_extractor = DependencyExtractor()

    def _categorize(self, ratio):
        """Return 'right', 'wrong' or 'not_sure' for a pattern ratio.

        Both thresholds are exclusive: a ratio equal to either one is
        'not_sure'.
        """
        if ratio > self.CONFIDENT:
            return 'right'
        if ratio < self.UNCONFIDENT:
            return 'wrong'
        return 'not_sure'

    def _get_template(self, ratio):
        """Return the two-slot format template matching a pattern ratio.

        Uses the same thresholds as ``_categorize``.
        """
        if ratio > self.CONFIDENT:
            return '{{+{}//{}+}}'
        if ratio < self.UNCONFIDENT:
            return '[-{}//{}-]'
        return '\\*{}//{}*\\'

    def process(self, sent):
        """Parse ``sent`` and annotate every token that yields a pattern.

        Args:
            sent: Sentence handed to ``self.parser.parse``.

        Returns:
            A ``(text, meta)`` tuple. ``text`` is the space-joined token
            texts, where each token with extracted info is replaced by its
            template filled with the token text and index. ``meta`` maps
            ``str(token.i)`` of those tokens to a dict with ``'key'``,
            ``'norm_pattern'`` and ``'ngram'``.
        """
        parsed = self.parser.parse(sent)  # TODO: normalize input?

        pieces = []
        meta = {}
        for token in parsed:
            info = self.dependency_extractor.process(token)
            if not info:
                # Nothing extracted for this token: keep its text unchanged.
                pieces.append(token.text)
                continue

            key = f'{token.lemma_}|{token.dep_}'
            norm_pattern = info['norm_pattern']
            ratio = self.mongodb_client.get_pattern_ratio(key, norm_pattern)
            template = self._get_template(ratio)
            pieces.append(template.format(token.text, token.i))
            meta[str(token.i)] = {
                'key': key,
                'norm_pattern': norm_pattern,
                'ngram': info['ngram'],
            }

        return (' '.join(pieces), meta)
Ejemplo n.º 6
0
 def __init__(self):
     """Create a ``MongoDBClient`` and keep it as ``self.mongodb_client``."""
     client = MongoDBClient()
     self.mongodb_client = client
Ejemplo n.º 7
0
 def __init__(self, data_directory):
     """Build the pattern matcher and the database client.

     Args:
         data_directory: Stored on the instance and passed to both ``EGP``
             and ``EVP``.
     """
     self.data_directory = data_directory

     # Same construction order as the call arguments: parser, EGP, EVP.
     parser = Parser()
     egp = EGP(data_directory)
     evp = EVP(data_directory)
     self.recommend = Recommend(parser, egp, evp)

     # Indexes are created as soon as the client exists.
     self.mongoDBClient = MongoDBClient()
     self.mongoDBClient.create_indexes()
Ejemplo n.º 8
0
 def __init__(self, parser, egp, evp):
     """Store the collaborators and compile the token regex.

     Args:
         parser: Kept as ``self.parser``.
         egp: Kept as ``self.egp``.
         evp: Kept as ``self.evp``.
     """
     self.mongoDBClient = MongoDBClient()
     # Raw string: '\w' in a plain literal is an invalid escape sequence
     # (a warning today, slated to become an error). The compiled pattern
     # is unchanged: a run of word characters or one of , . : ; ! ?
     self.re_token = re.compile(r'\w+|[,.:;!?]')
     self.parser = parser
     self.egp = egp
     self.evp = evp