Example #1
def _declareStuff():
    try:
        shutil.rmtree('./app/collection')
    except FileNotFoundError:
        print('Creating collection folder')
    os.mkdir('./app/collection')

    try:
        shutil.rmtree('./app/index')
    except FileNotFoundError:
        print('Creating index folder')
    os.mkdir('./app/index')

    global aux
    aux = {}

    global sndx
    sndx = {}

    global prfx
    prfx = Trie()

    global psfx
    psfx = Trie()

    global docs
    docs = get_docs("./app/docs/reuters21578.tar.gz")

    global length
    length = get_docs_length("./app/docs/reuters21578.tar.gz")

    global docs_names
    docs_names = names_generator(length)
Example #2
def test_trie():
    t = Trie(['he','hers','his','she'])
    results = t.match('a his hoge hershe xx.')
    assert len(results) == 5
    rows = [(2,3), (11,2), (11,4), (14,3), (15,2)]
    for i in xrange(5):
        print results[i], rows[i]
        assert results[i] == rows[i]
Example #3
def test_trie():
    t = Trie(['he', 'hers', 'his', 'she'])
    results = t.match('a his hoge hershe xx.')
    assert len(results) == 5
    rows = [(2, 3), (11, 2), (11, 4), (14, 3), (15, 2)]
    for i in xrange(5):
        print results[i], rows[i]
        assert results[i] == rows[i]
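
Both versions of this test assume a multi-pattern Trie whose match() method reports every occurrence of a dictionary word as a (start_offset, length) pair, sorted by position. As a sanity check of the expected rows, a brute-force scan (an illustration of the output format only, not the trie implementation) yields the same tuples:

def naive_match(patterns, text):
    """Brute-force stand-in that reproduces the (start, length) result format."""
    hits = []
    for start in range(len(text)):
        for pat in patterns:
            if text.startswith(pat, start):
                hits.append((start, len(pat)))
    return sorted(hits)

assert naive_match(['he', 'hers', 'his', 'she'], 'a his hoge hershe xx.') == \
    [(2, 3), (11, 2), (11, 4), (14, 3), (15, 2)]
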
Example #4
class TestTrie(unittest.TestCase):

    def setUp(self):
        self.corpus = ['black',
                       'blue',
                       'orange',
                       'orangered',
                       'green',
                       'aqua marine']
        self.fixture = Trie()
        for i in self.corpus:
            self.fixture[i] = True

    def test_len(self):
        assert len(self.fixture) == len(self.corpus)

    def test_path_exists(self):
        for i in self.corpus:
            assert self.fixture._path_exists(i)

        assert not self.fixture._path_exists('not a color')

    def test_delitem(self):
        key = self.corpus[0]
        assert self.fixture[key] == True
        del self.fixture[key]
        self.assertRaises(KeyError, self.fixture.__getitem__, key)

        assert not self.fixture._path_exists(key[:-1])

    def test_getitem(self):
        for i in self.corpus:
            assert self.fixture[i] == True
        self.assertRaises(KeyError, self.fixture.__getitem__, 'not a color')
    
    def test_setitem(self):
        self.assertRaises(KeyError, self.fixture.__setitem__, '', None)

    def test_contains(self):
        for i in self.corpus:
            assert i in self.fixture

        assert 'not a color' not in self.fixture

    def test_iterator(self):
        collected_keys = set([k for k in self.fixture])
        assert collected_keys == set(self.corpus)

        for k in self.fixture:
            assert self.fixture[k] == True

    def test_iterkeys(self):
        keys = set()
        for i in self.fixture.iterkeys():
            keys.add(i)
        assert keys == set(self.corpus)
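
The suite above pins down the mapping interface the Trie under test must provide: item access, deletion that prunes dead branches (test_delitem checks that the 'blac' path disappears once 'black' is removed), length, containment, and key iteration. For orientation, here is a minimal character-keyed trie sketch with the same interface (rename or alias it to Trie to run the suite against it); it is an illustrative stand-in, not the project's actual class:

class DictTrie:
    """Minimal character-keyed trie mapping; an illustrative stand-in only."""

    _VALUE = object()          # sentinel key under which a node stores its value

    def __init__(self):
        self._root = {}
        self._size = 0

    def _node(self, key):
        node = self._root
        for ch in key:
            node = node.get(ch)
            if node is None:
                return None
        return node

    def _path_exists(self, key):
        return self._node(key) is not None

    def __setitem__(self, key, value):
        if not key:
            raise KeyError('empty keys are not allowed')
        node = self._root
        for ch in key:
            node = node.setdefault(ch, {})
        if DictTrie._VALUE not in node:
            self._size += 1
        node[DictTrie._VALUE] = value

    def __getitem__(self, key):
        node = self._node(key)
        if node is None or DictTrie._VALUE not in node:
            raise KeyError(key)
        return node[DictTrie._VALUE]

    def __delitem__(self, key):
        path = [(None, self._root)]            # remember the path for pruning
        node = self._root
        for ch in key:
            if ch not in node:
                raise KeyError(key)
            node = node[ch]
            path.append((ch, node))
        if DictTrie._VALUE not in node:
            raise KeyError(key)
        del node[DictTrie._VALUE]
        self._size -= 1
        # prune branches that no longer lead to any stored value
        for (ch, child), (_, parent) in zip(reversed(path[1:]), reversed(path[:-1])):
            if child:
                break
            del parent[ch]

    def __len__(self):
        return self._size

    def __contains__(self, key):
        node = self._node(key)
        return node is not None and DictTrie._VALUE in node

    def __iter__(self):
        def walk(node, prefix):
            for ch, child in node.items():
                if ch is DictTrie._VALUE:
                    yield prefix
                else:
                    yield from walk(child, prefix + ch)
        return walk(self._root, '')

    iterkeys = __iter__        # the suite also calls iterkeys()
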
Example #5
    def __init__(self, *args, **kwargs):
        self._dict = {}
        self._trie = Trie(*args, **kwargs)

        d = dict(*args, **kwargs)
        for key, value in d.items():
            self._dict[case_insensitive(key)] = value
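
This wrapper keeps a parallel dict keyed through a case_insensitive() helper that is not shown in the snippet. A plausible minimal definition, assumed here purely for illustration, is simple lowercasing:

def case_insensitive(key):
    # Hypothetical helper: normalise string keys so lookups ignore case.
    return key.lower() if isinstance(key, str) else key
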
Example #6
def split_file(
    in_file,
    out_file,
    fields="bgg_user_name",
    trie_file=None,
    limits=LIMIT,
    construct=False,
):
    """ split input file along prefixes """

    trie = None

    if trie_file and not construct:
        LOGGER.info("loading trie from file <%s>...", trie_file)
        trie = _trie_from_file(trie_file)

    if not trie:
        LOGGER.info("making trie for <%s>...", in_file)
        full_trie = _make_trie(file=in_file, fields=fields)
        limits = tuple(arg_to_iter(limits)) or (LIMIT, )

        for limit in limits:
            trie = Trie(_prefixes(full_trie, limit=limit))
            LOGGER.info("%d prefixes using limit %d", len(trie), limit)
            out_path = trie_file.format(limit=limit) if trie_file else None
            if not out_path or out_path == "-":
                for prefix, count in trie.items():
                    print(f"{prefix}\t{count}")
            else:
                with open(out_path, "w") as file_obj:
                    for prefix, count in trie.items():
                        file_obj.write(f"{prefix}\t{count}\n")

    LOGGER.info("constructed trie of size %d", len(trie))
    _save_to_prefixes(dst=out_file, trie=trie, file=in_file, fields=fields)
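
split_file relies on project helpers (_make_trie, _prefixes, _trie_from_file, _save_to_prefixes) that are not shown here. The underlying idea is that once a trie maps prefixes to row counts, each record key can be routed to the bucket named by its longest matching prefix. A minimal sketch of that routing step with pytrie (route_keys and the toy data are assumptions for illustration, not the project's code):

from pytrie import SortedStringTrie as Trie

def route_keys(keys, prefix_trie):
    """Group record keys by their longest matching prefix in the trie."""
    buckets = {}
    for key in keys:
        prefix = prefix_trie.longest_prefix(key, default=None)
        buckets.setdefault(prefix, []).append(key)
    return buckets

prefix_trie = Trie(a=2, ab=5, b=3)
print(route_keys(['abc', 'axe', 'bar', 'zzz'], prefix_trie))
# {'ab': ['abc'], 'a': ['axe'], 'b': ['bar'], None: ['zzz']}
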
Example #7
    def __init__(self, *args, **kwargs):
        # pylint: disable=super-init-not-called
        self._dict = {}
        self._trie = Trie(*args, **kwargs)

        d = dict(*args, **kwargs)
        for key, value in d.iteritems():
            self._dict[case_insensitive(key)] = value
Example #8
 def setUp(self):
     self.corpus = ['black',
                    'blue',
                    'orange',
                    'orangered',
                    'green',
                    'aqua marine']
     self.fixture = Trie()
     for i in self.corpus:
         self.fixture[i] = True
Example #9
    def _load_data(file_path):
        """
        Temporary Proof of concept function - Loads information from CSV file
        Parses and splits CORP_NME for search_trie
        :param file_path: File path of CSV
        :return: None
        """
        import csv

        # TODO: Move field constants elsewhere
        index_field = 'CORP_NUM'
        end_event_field = 'END_EVENT_ID'
        name_field = 'CORP_NME'

        Search.__search_trie = Trie()
        Search.__cached_name = dict()
        if not os.path.isfile(file_path):
            logger.warning('File not found. Empty search trie instantiated')
            return

        with open(file_path) as file:
            reader = csv.DictReader(file,
                                    delimiter=';',
                                    quoting=csv.QUOTE_NONE)
            try:
                for row in reader:
                    # Skip rows with an empty END_EVENT_ID
                    if row[end_event_field] in (None, ''):
                        continue

                    # Build Cache Dictionary
                    if row[index_field] not in Search.__cached_name:
                        Search.__cached_name[row[index_field]] = \
                            [row[name_field]]
                    else:
                        Search.__cached_name[row[index_field]].append(
                            row[name_field])

                    # Build Search Trie
                    # Remove non-alphanumeric characters and split words
                    clean_name = Search._clean_string(row[name_field])
                    for word in clean_name.split():
                        if word not in (None, ''):
                            # Create all possible suffixes of word
                            suffix_list = [word[i:] for i in range(len(word))]
                            for suffix in suffix_list:
                                if suffix not in Search.__search_trie:
                                    Search.__search_trie[suffix] = set()
                                Search.__search_trie[suffix].add(
                                    row[index_field])

            except UnicodeDecodeError:
                logger.error('Unexpected input at line %s', reader.line_num)
        logger.info('Loaded and indexed data')
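
The suffix insertion above is what turns a prefix trie into a substring index: a query string occurs inside a word exactly when it is a prefix of one of that word's suffixes. A small illustration with pytrie (an assumption about the trie type; Search.__search_trie may use a different implementation):

from pytrie import SortedStringTrie as Trie

search_trie = Trie()
for corp_num, word in [('0001', 'corner'), ('0002', 'thorn')]:
    for i in range(len(word)):                 # insert every suffix of the word
        search_trie.setdefault(word[i:], set()).add(corp_num)

# 'orn' occurs inside both words, so both ids come back
matches = set()
for ids in search_trie.values(prefix='orn'):
    matches |= ids
print(sorted(matches))   # ['0001', '0002']
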
Example #10
def load_data():
    global data_loaded, prefix_tree, data, country_index, name_index
    json_data = open("world_universities_and_domains.json").read()
    data = json.loads(json_data)
    for i in data:
        country_index[i["country"].lower()].append(i)
        name_index[i['name'].lower()] = i
        splitted = i['name'].split(" ")
        if len(splitted) > 1:
            for splitted_name in splitted[1:]:
                name_index[splitted_name.lower() + str(uuid.uuid1())] = i
    prefix_tree = Trie(**name_index)

    data_loaded = True
Example #11
def load_data():
    global data_loaded, prefix_tree, data, country_index, name_index, domain_index
    response = requests.get("https://raw.githubusercontent.com/Hipo/university-domains-list/master/world_universities_and_domains.json")
    data = response.json()
    for i in data:
        country_index[i["country"].lower()].append(i)
        name_index[i['name'].lower()] = i
        for domain in i["domains"]:
            domain_index[domain].append(i)
        splitted = i['name'].split(" ")
        if len(splitted) > 1:
            for splitted_name in splitted[1:]:
                name_index[splitted_name.lower() + str(uuid.uuid1())] = i
    prefix_tree = Trie(**name_index)

    data_loaded = True
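
Both loaders finish by wrapping name_index in a trie, which makes prefix (autocomplete-style) lookups cheap. Assuming pytrie's SortedStringTrie (the import is not shown in these snippets), a query handler over the resulting prefix_tree might look like this sketch:

def search_by_name_prefix(prefix, limit=10):
    """Return up to `limit` university records whose indexed name starts with `prefix`."""
    # Illustrative only; the uuid-suffixed keys mean a university can match more than once.
    seen, unique = set(), []
    for record in prefix_tree.values(prefix=prefix.lower()):
        if record['name'] not in seen:
            seen.add(record['name'])
            unique.append(record)
        if len(unique) >= limit:
            break
    return unique
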
Example #12
def context_entities(setname, domain_set):
    for domain in domain_set:
        with open(
                JSON_OUTPUT_DIR +
                '/mentions/{}/{}.json'.format(setname, domain), 'r') as f:
            # add entities to this trie
            trie = Trie()
            for ent_id in domain2entities[domain]:
                # without considering the multi-category case
                trie.setdefault(entities2name[ent_id].lower(), 0)
                # considering the multi-category case
                # trie.setdefault(entities2alias[ent_id], 0)
            total_alias_count = 0
            total_doc_count = 0
            matched_text_list = []
            for line in tqdm(f, desc='processing {}...'.format(domain)):
                datum = json.loads(line.strip())
                text = decode(datum['mention_context_tokens'])
                total_doc_count += 1
                i = 0
                matched_text = ''
                while i < len(text):
                    item = trie.longest_prefix_item(text[i:], default=None)
                    if item is not None:
                        prefix, key_id = item
                        if (i == 0 or text[i-1] == ' ') \
                                and (i+len(prefix) == len(text) or text[i+len(prefix)] == ' '):
                            total_alias_count += 1
                            i += len(prefix)
                            matched_text += '##' + prefix + '##'
                        else:
                            matched_text += prefix
                            i += len(prefix)
                    else:
                        matched_text += text[i]
                        i += 1
                matched_text_list.append(matched_text)
            print('Avg alias count {:.2f} in {} documents.'.format(
                total_alias_count / total_doc_count, domain))
        with open(
                JSON_OUTPUT_DIR +
                '/mentions/{}/{}_matched.json'.format(setname, domain),
                'w') as f:
            for text in matched_text_list:
                f.write(text + '\n\n')
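
The matcher above leans on pytrie's longest_prefix_item, which returns the (key, value) pair for the longest trie key that is a prefix of its argument, or the supplied default when no key matches; the surrounding checks then enforce word boundaries. A tiny illustration of that call (the entity names are made up):

from pytrie import SortedStringTrie as Trie

entity_names = Trie({'new york': 0, 'new york city': 0, 'york': 0})
print(entity_names.longest_prefix_item('new york city marathon', default=None))
# ('new york city', 0) -- the longest name anchored at the start of the string
print(entity_names.longest_prefix_item('boston marathon', default=None))
# None -- no entity name is a prefix here
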
Example #13
 def createTrie(self, filePath):
     trie = Trie()
     # with open("/Users/nali/Downloads/final_dict.txt", 'r') as file:
     with open(filePath, 'r') as file:
         lines = file.readlines()
         flag = 0
         for line in lines:
             flag += 1
             parts = line.strip().split('\t')
             if (len(parts) > 1):
                 words = parts[1].split(';')
                 for word in words:
                     if (word != ''):
                         if trie.get(word) is None:
                             trie[word] = set()
                         trie[word].add(parts[0])
     return trie
Example #14
 def __init__(self):
     self.full_match = Trie()
     self.repairs_match = Trie()
     self.full_sentences = {}
Example #15
def _make_trie(file, fields="bgg_user_name", sep=","):
    return Trie(_process_file(file, fields, sep, count=True))
Example #16
def _trie_from_file(file):
    try:
        return Trie(_prefixes_from_file(file))
    except Exception:
        pass
    return None
Example #17
            word2soundex[sndx] = []
        if word not in word2soundex[sndx]:
            word2soundex[sndx].append(word)

        # Prefix tree loader
        if word not in prefix_tree:
            prefix_tree[word] = None

        rword = word[::-1]
        # Postfix_tree loader
        if rword not in postfix_tree:
            postfix_tree[rword] = None

    return aux, word2soundex, prefix_tree, postfix_tree


if __name__ == '__main__':
    aux = {'one': [0], 'five': [0], 'reuters': [0]}
    w2s = {'O500': ['one'], 'F100': ['five'], 'R362': ['reuters']}
    prfx = Trie(one=None, five=None, reuters=None, chelny=None)
    psfx = Trie(eon=None, evif=None, sretuer=None, ylehc=None)
    doc = next(get_docs("./docs/reuters21578.tar.gz"))
    name = 'collection 1'

    aux, w2s, prfx, psfx = load_data(doc, name, aux, w2s, prfx, psfx)

    print(aux)
    print(w2s)
    print(prfx)
    print(psfx)
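
Pairing a prefix trie with a postfix trie keyed on reversed words is the usual setup for single-wildcard queries of the form pre*post: words starting with the prefix come from prfx, words ending with the postfix come from psfx (looked up by the reversed postfix), and the answer is their intersection. A sketch of that lookup, assuming pytrie-style keys(prefix=...) on both tries:

def wildcard_lookup(pattern, prefix_tree, postfix_tree):
    """Resolve a single-'*' pattern such as 're*ers' against the two tries."""
    pre, post = pattern.split('*')
    starts_with = set(prefix_tree.keys(prefix=pre))
    ends_with = {w[::-1] for w in postfix_tree.keys(prefix=post[::-1])}
    return sorted(starts_with & ends_with)

# With the demo tries above: wildcard_lookup('re*ers', prfx, psfx) -> ['reuters']
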
Example #18
            reduce(lambda i, j: i & j, (set(x) for x in res)))
    except:
        message += 'NOTHING FOUND BY YOUR QUERY\n'

    message += '-----------------------------------------------------------\n'
    message += f'InnoGruk found {len(relevant_documents)} documents by your query\n'
    message += '-----------------------------------------------------------'

    return message, relevant_documents, track[2:]


if __name__ == '__main__':
    prfx = Trie(apple=None,
                borrow=None,
                friend=None,
                ginger=None,
                lermontov=None,
                money=None,
                november=None,
                object=None)

    psfx = Trie(elppa=None,
                worrob=None,
                dneirf=None,
                regnig=None,
                votnomrel=None,
                yenom=None,
                rebmevon=None,
                tcejbo=None)

    sndx = {
        'A140': ['apple'],
Example #19
import os
from pytrie import StringTrie as Trie
import re
myTrie = Trie()  # create empty trie FOR QUERY2
myTrie1 = Trie()  #create empty trie FOR QUERY1
path = input('Hello,\nFirst, enter a path that contains txt files: ')
all_files = os.listdir(path)

# BUILD TRIE FOR QUERY 2
for i in range(len(all_files)):
    with open(os.path.join(path, all_files[i]), 'r') as f:
        for line in f:
            res = re.findall(r'\w+', line)
            # take the words of each line, lowercase them, and store them in the trie keyed to the file name
            for word in res:
                word = word.lower()
                # check whether the word is already a key in the trie
                if word not in myTrie:
                    myTrie[word] = {all_files[i]}  # word -> set of files
                else:
                    myTrie[word].add(all_files[i])  # word already present: just record this file

# BUILD TRIE FOR QUERY 1
for i in range(len(all_files)):
    with open(os.path.join(path, all_files[i]), 'r') as f:
        counter = 0
        for line in f:
            res = re.findall(r'\w+', line)
Example #20
def loadtree(arr):
    global trie
    trie = Trie()
    for key in arr:
        trie[key[::-1]] = key
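
Storing each key reversed turns suffix matching into an ordinary prefix lookup. For example, assuming a pytrie-style trie with values(prefix=...), all loaded words ending in 'ing' can be recovered like this:

loadtree(['running', 'sing', 'song'])
print(sorted(trie.values(prefix='gni')))   # ['running', 'sing'] -- the words ending in 'ing'
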
Example #21
def generate_weight_tree(words, func):
    weights = {}
    for w in words:
        weights[w] = func(w)
    return Trie(weights)
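
As a usage illustration (assuming the Trie here is pytrie's SortedStringTrie and using len as a stand-in weighting function), the helper builds a trie from any scoring callable, after which prefix-restricted items carry their weights along:

weights = generate_weight_tree(['tea', 'teapot', 'ten'], len)
print(weights.items(prefix='tea'))   # [('tea', 3), ('teapot', 6)]
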
Example #22
from flask import jsonify
from pytrie import SortedStringTrie as Trie
import pandas as pd
import time
import json
import cPickle as pickle

trieds = Trie()  #trieds is short for trie datastructure.
listofallnames = []


def loadstrings():
    """
    Function to load a csv file, preprocess the data and load it into the Trie Datastructure
    """
    start = time.time()
    df = pd.read_csv("data.csv", error_bad_lines=False
                     )  #loading csv data file as a dataframe using pandas
    listofallnames = {}
    print 'key generation in process, please wait.'
    count = 0
    for index, row in df.iterrows():
        rowdata = {}
        #preprocessing data in the dataframe
        rowdata['firstname'] = str(row['givenName']).lower()
        rowdata['middlename'] = str(row['middleName']).lower()
        rowdata['lastname'] = str(row['surname']).lower()
        if str(row['givenName']) != 'nan':
            try:
                listofallnames[str(rowdata['firstname'])].append(rowdata)
            except KeyError:
Example #23
    with open('candidates.txt','r') as candidates:
        data = candidates.readlines()
        for key in data:
            output.append(key.split("\n")[0])
    return output

onepercent = []
# print(getelse("hello",2))
# judgeoutput = dict()
#
# alpha = "abcdefghijklmnopqrstuvwxyz"
print("Loading Dictionary")
dictionary = loaddictionary()
dictionaryprime = loadprimedictionary()
print("Building Tree Begin",time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
t = Trie(dictionary)
tprime = Trie(dictionaryprime)
print("Building Tree Stop",time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
print("Loading Candidates")
candidates = loadcandidates()
judgeoutput = dict()
i = 0
j = 0
print("Begin Time:",time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
for key in candidates:
    if i % 170 == 0:
        print(i // 170, "%")
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    judgeoutput[key] = judge(key,j)
    i = i + 1
    j = j + 1
Example #24
def solution():

    candidatesTokens = []
    blendResults = []
    blendWords = []
    prefixTrie = Trie()
    reversalTrie = Trie()

    def equalLetter(replace, match, char1, char2):
        if char1 == char2:
            return match
        else:
            return replace

    def split(word):
        leng = len(word)
        prefix = word[:int(leng / 2) + 1]
        suffix = word[int(leng / 2 - 1):][::-1]
        splited = [prefix, suffix]
        return splited

    def calPreRe():
        truePositiveAmount = 0

        blendCorrectWords = []
        blendFalseWords = []

        for word in blendWords:
            if word in blendResults:
                truePositiveAmount += 1
                blendCorrectWords.append(word)

        print("  truePositive : ")
        print(truePositiveAmount)
        print("\n\n ")

        # precision
        recall = float(truePositiveAmount) / (len(blendResults) - 32)
        precision = float(truePositiveAmount) / len(blendWords)

        print(" Recall is : ")
        print(recall)
        print("\n\n")
        print(" Precision is : ")
        print(precision)
        print("\n\n")

        for word in blendResults:
            if word not in blendWords:
                blendFalseWords.append(word)

        print("count of blendCorrectWords:", len(blendCorrectWords))

        print("blendCorrectWords:", blendCorrectWords)

        print("count of blendFalseWords:", len(blendFalseWords))

        print("blendFalseWords:", blendFalseWords)

    def input():
        file = open("data/dict.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()

        repeatLetters = [
            'aaaa', 'bbbb', 'cccc', 'dddd', 'eeee', 'ffff', 'gggg', 'hhhh',
            'iiii', 'jjjj', 'kkkk', 'llll', 'mmmm', 'nnnn', 'oooo', 'pppp',
            'qqqq', 'rrrr', 'ssss', 'tttt', 'uuuu', 'vvvv', 'wwww', 'xxxx',
            'yyy', 'zzzz'
        ]

        # input prefixly
        for word in wordList:
            word = word.strip()
            word = word.lower()
            prefixTrie[word] = word
            reverseWord = word[::-1]
            reversalTrie[reverseWord] = reverseWord

        file1 = open("data/candidates.txt", "r", encoding='utf-8')
        wordList1 = file1.read().splitlines()
        for word in wordList1:
            word = word.strip()
            word = word.lower()
            flag = 0
            for subString in repeatLetters:
                if subString in word:
                    flag = 1
                    break
            if flag == 0:
                candidatesTokens.append(word)

        file2 = open("data/blends.txt", "r", encoding='utf-8')
        wordList2 = file2.read().splitlines()

        for word in wordList2:
            word = word.split()[0]
            word = word.lower()
            blendResults.append(word)

    def editDistance(word1, word2):
        A = [[]]
        replace = -1
        match = 1
        deletion = -1
        insertion = -1

        word1Length = len(word1)
        word2Length = len(word2)
        A = np.zeros([word1Length, word1Length])
        # construct matrix
        for j in range(word1Length):
            for k in range(word1Length):
                A[j][k] = max(
                    0, A[j][k - 1] + deletion, A[j - 1][k] + insertion,
                    A[j - 1][k - 1] +
                    equalLetter(replace, match, word1[k - 1], word2[j - 1]))

        max_item = max(max(row) for row in A)
        return max_item

    def compareUsingLED(pref, reverse=0):
        if reverse:
            trieList = reversalTrie
        else:
            trieList = prefixTrie
        prefixDict = trieList.keys(prefix=pref)
        threshold = len(pref) * (0.85)

        prefixFlag = 0
        for word in prefixDict:
            LEDValue = editDistance(pref, word)
            if LEDValue >= threshold:
                JWValue = jarowinklerSimilarity.get_jaro_distance(
                    pref, word[:int(len(pref))], winkler=True, scaling=0.1)
                if JWValue > 0.8:
                    return True
        return False

    # input
    input()

    for word in candidatesTokens:
        prefix, reverseSuffix = split(word)
        if compareUsingLED(prefix, 0) and compareUsingLED(reverseSuffix, 1):
            blendWords.append(word)

    calPreRe()
Example #25
def create_training_instances(input_file, max_seq_length, tokenizer, rng,
                              alias2entities):
    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    """Create `TrainingInstance`s from raw text."""
    all_documents = []
    all_alias_token_spans = []
    from pytrie import SortedStringTrie as Trie
    trie = Trie()
    # add entities to this trie
    for alias, ents in alias2entities.items():
        trie.setdefault(alias, 0)

    with open(input_file, "r") as reader:
        for line in tqdm(reader, desc='converting tokens'):
            line = tokenization.convert_to_unicode(line.strip())
            line = json.loads(line)['text']

            tokens = []
            if do_lower_case:
                line = line.lower()
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in line:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        tokens.append(c)
                    else:
                        tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(tokens) - 1)

            # match the aliases occurring in this document
            alias_spans = match_alias(line, trie, alias2entities)
            # at this point the spans are over coarse-grained tokens; span_end is the alias's last token
            alias_token_spans = [(char_to_word_offset[span[0]],
                                  char_to_word_offset[span[1] - 1])
                                 for span in alias_spans]

            for span, token_span in zip(alias_spans, alias_token_spans):
                alias_tokens = ' '.join(tokens[token_span[0]:token_span[1] +
                                               1])
                alias_texts = line[span[0]:span[1]]
                assert alias_tokens in alias2entities, print(
                    alias_tokens, token_span, alias_texts, span)
            # assert all(' '.join(tokens[span[0]: span[1] + 1]) in alias2entities for span in alias_token_spans), \
            #     print([' '.join(tokens[span[0]: span[1] + 1]) for span in alias_token_spans])

            tok_to_orig_index = []  # fine-grained -> coarse-grained
            orig_to_tok_index = []  # coarse-grained -> fine-grained
            real_tokens = []
            for (i, token) in enumerate(tokens):
                orig_to_tok_index.append(len(real_tokens))
                sub_tokens = tokenizer.tokenize(token)
                for sub_token in sub_tokens:
                    tok_to_orig_index.append(i)
                    real_tokens.append(sub_token)
            # Check whether the span's last coarse-grained token is the final token of the line:
            # if so, end at the last fine-grained token; otherwise, end right before the first
            # fine-grained token of the next coarse-grained token.
            real_alias_token_spans = []
            for span in alias_token_spans:
                if span[1] == len(tokens) - 1:
                    real_end = orig_to_tok_index[-1]
                else:
                    real_end = orig_to_tok_index[span[1] + 1] - 1
                real_start = orig_to_tok_index[span[0]]
                real_alias_token_spans.append((real_start, real_end))

            # alias_token_spans = [(orig_to_tok_index[span[0]], orig_to_tok_index[span[1]])
            #                      for span in alias_token_spans]

            all_documents.append(real_tokens)
            all_alias_token_spans.append(real_alias_token_spans)

    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    for document_index in tqdm(range(len(all_documents)),
                               total=len(all_documents),
                               desc='creating instances'):
        instances.extend(
            create_instances_from_document(all_documents, document_index,
                                           all_alias_token_spans,
                                           max_seq_length, vocab_words, rng))

    rng.shuffle(instances)
    return instances
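
The char_to_word_offset list built in the loop above maps every character position of the raw line to the index of the whitespace-delimited token containing it, which is what allows the character-level alias spans to be converted into token spans. A quick worked example:

line = "new york city"
# char_to_word_offset as produced by the loop above:
# [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2]
# so the character span (4, 13) covering "york city" maps to the token span (1, 2).
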
Example #26
t0 = time.time()  #Start time
for index, row in train.iterrows():
    #Normalise the names of dishes.

    with_sc = row[1]

    processed_string = preprocessing(with_sc)

    for token in processed_string.split():
        #Create Inverted Index
        if inverted_index.get(token) is None:
            inverted_index.update({token: set()})
        inverted_index[token].add(row[0])

prefix_tree = Trie(**inverted_index)

t1 = time.time() - t0  #Time required for index construction

print("Time taken for Trie generation: {} ".format(t1))


def search_results(query):
    #Return primary keys for query matching strings

    t0 = time.time()  #Start time

    found = False

    processed_query = preprocessing(
        query)  #Apply same pre-processing as before
Example #27
def solution():

    trie = Trie()
    reverseTrie = Trie()
    candidatesList = []
    blendList = []
    blendAnswerList = []
    repeatSubString = ['aaaa', 'bbbb', 'cccc', 'dddd', 'eeee', 'ffff', 'gggg', 'hhhh', 'iiii', 'jjjj', 'kkkk', 'llll',
                       'mmmm', 'nnnn', 'oooo', 'pppp', 'qqqq', 'rrrr', 'ssss', 'tttt', 'uuuu', 'vvvv', 'wwww', 'xxxx',
                       'yyy', 'zzzz']

    def inputTrie():
        file = open("data/dict.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()

        # input prefixly
        for word in wordList:
            word = word.strip()
            word = word.lower()
            trie[word] = word
            reverseWord = word[::-1]
            reverseTrie[reverseWord] = reverseWord

    def inputCandidate():
        file = open("data/candidates.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        for word in wordList:
            word = word.strip()
            word = word.lower()
            flag = 0
            for subString in repeatSubString:
                if subString in word:
                    flag = 1
                    break
            if flag == 0:
                candidatesList.append(word)


    def inputBlendAnswerList():
        file = open("data/blends.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()

        for word in wordList:
            word = word.split()[0]
            word = word.lower()
            blendAnswerList.append(word)

    def splitWord(word):
        length = len(word)
        prefix = word[:int(length/2) + 1]
        reverseSuffix = word[int(length/2 - 1) :][::-1]
        splitedCandidate = [prefix, reverseSuffix]
        return splitedCandidate

    def equal(replace, match, char1, char2):
        if char1 == char2:
            return match
        else:
            return replace

    def localEditDistance(word1,word2):
        A = [[]]
        deletion = -1
        insertion = -1
        replace = -1
        match = 1


        word1Length = len(word1)
        word2Length = len(word2)
        A = np.zeros([word1Length, word1Length])
        # construct matrix
        for j in range(word1Length):
            for k in range(word1Length):
                A[j][k] = max(0, A[j][k-1] + deletion, A[j-1][k] + insertion, A[j-1][k-1] + equal(replace, match, word1[k-1],word2[j-1]))

        max_item = max(max(row) for row in A)
        return max_item

    def comparePrefixNDictUsingLED(pref, reverse = 0):
        if reverse:
            trieList = reverseTrie
        else:
            trieList = trie
        prefixDict = trieList.keys(prefix = pref)
        threshold = len(pref) * (0.85)

        prefixFlag = 0
        for word in prefixDict:
            LEDValue = localEditDistance(pref, word)
            if LEDValue >= threshold:
                JWValue = jarowinklerSimilarity.get_jaro_distance(pref, word[:int(len(pref))], winkler = True, scaling = 0.1)
                if JWValue > 0.8:
                    return True
        return False

    def calAccuracy():
        truePositiveAmount = 0

        print(" ***********  blendlist : ")
        print(blendList)
        print("\n\n ")
        print(" *********** count blendlist : ")
        print(len(blendList))
        print("\n\n ")
        blendFindedList = []
        blendMissedList = []

        for word in blendList:
            if word in blendAnswerList:
                truePositiveAmount += 1
                blendFindedList.append(word)

        print(" ***********  truePositive : ")
        print(truePositiveAmount)
        print("\n\n ")

        # precision
        precision = float(truePositiveAmount) / len(blendList)
        recall = float(truePositiveAmount) / (len(blendAnswerList) - 32)

        print("***** Precision is : ")
        print(precision)
        print("\n\n")
        print("***** Recall is : ")
        print(recall)
        print("\n\n")

        for word in blendAnswerList:
            if word not in blendList:
                blendMissedList.append(word)

        print(" *********** count of  blendFindedList : ")
        print(len(blendFindedList))
        print(" ***********  blendFindedList : ")
        print(blendFindedList)
        print("\n\n ")
        print(" *********** count of  blendMissedList : ")
        print(len(blendMissedList))
        print(" ***********  blendMissedList : ")
        print(blendMissedList)
        print("\n\n ")


    # input
    inputTrie()
    inputCandidate()
    inputBlendAnswerList()

    # append filtered candidates to blendList
    for word in candidatesList:
        prefix, reverseSuffix = splitWord(word)
        if comparePrefixNDictUsingLED(prefix, 0) and comparePrefixNDictUsingLED(reverseSuffix, 1):
            blendList.append(word)

    calAccuracy()
Example #28
  numeros['treinti{}'.format(text)] = 30 + number
  numeros['treinta y {}'.format(text)] = 30 + number
  numeros['cuarenti{}'.format(text)] = 40 + number
  numeros['cuarenta y {}'.format(text)] = 40 + number
  numeros['cincuenti{}'.format(text)] = 50 + number
  numeros['cincuenta y {}'.format(text)] = 50 + number
  numeros['sesenti{}'.format(text)] = 60 + number
  numeros['sesenta y {}'.format(text)] = 60 + number
  numeros['setenti{}'.format(text)] = 70 + number
  numeros['setenta y {}'.format(text)] = 70 + number
  numeros['ochenti{}'.format(text)] = 80 + number
  numeros['ochenta y {}'.format(text)] = 80 + number
  numeros['noventi{}'.format(text)] = 90 + number
  numeros['noventa y {}'.format(text)] = 90 + number

numeros = Trie(numeros)

multiplicadores = Trie({
  'mil': 1000,
  'ientos': 100,
  'cientos': 100,
  'ciento': 100,
  'cientas': 100,
  'cienta': 100,
  'cien': 100
})

momento_del_dia = Trie({ 'de la noche': 'pm', 'de la mañana': 'am', 'del mediodia': 'pm', 'de la tarde': 'pm' })

time_of_day_to_sum = {
  'noche': 12,
Example #29
def solution():

    trie = Trie()
    reverseTrie = Trie()
    candidatesList = []
    blendList = []
    blendAnswerList = []

    def inputTrie():
        file = open("data/dict.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()

        # input prefixly
        for word in wordList:
            word = word.strip()
            word = word.lower()
            trie[word] = word
            reverseWord = word[::-1]
            reverseTrie[reverseWord] = reverseWord

    def inputCandidate():
        file = open("data/candidates.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        for word in wordList:
            word = word.strip()
            word = word.lower()
            candidatesList.append(word)

    def inputBlendAnswerList():
        file = open("data/blends.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()

        for word in wordList:
            word = word.split()[0]
            word = word.lower()
            blendAnswerList.append(word)

    def splitWord(word):
        length = len(word)
        prefix = word[:int(length / 2)]
        reverseSuffix = word[int(length / 2):][::-1]
        splitedCandidate = [prefix, reverseSuffix]
        return splitedCandidate

    def equal(replace, match, char1, char2):
        if char1 == char2:
            return match
        else:
            return replace

    def localEditDistance(word1, word2):
        A = [[]]
        deletion = -1
        insertion = -1
        replace = -1
        match = 1

        word1Length = len(word1)
        word2Length = len(word2)
        A = np.zeros([word1Length, word1Length])
        # construct matrix
        for j in range(word1Length):
            for k in range(word1Length):
                A[j][k] = max(
                    0, A[j][k - 1] + deletion, A[j - 1][k] + insertion,
                    A[j - 1][k - 1] +
                    equal(replace, match, word1[k - 1], word2[j - 1]))

        max_item = max(max(row) for row in A)
        return max_item

    def comparePrefixNDictUsingLED(pref, reverse=0):
        if reverse:
            trieList = reverseTrie
        else:
            trieList = trie
        prefixDict = trieList.keys(prefix=pref)
        threshold = len(pref) * (0.8)

        prefixFlag = 0
        for word in prefixDict:
            LEDValue = localEditDistance(pref, word)
            if LEDValue >= threshold:
                JWValue = jarowinklerSimilarity.get_jaro_distance(
                    pref, word[:int(len(pref))], winkler=True, scaling=0.1)
                if JWValue > 0.95:
                    return True
        return False

    def calAccuracy():
        truePositiveAmount = 0

        print(" ***********  blendlist : ")
        print(blendList)
        print("\n\n ")
        print(" *********** count blendlist : ")
        print(len(blendList))
        print("\n\n ")

        for word in blendList:
            if word in blendAnswerList:
                truePositiveAmount += 1

        print(" ***********  truePositive : ")
        print(truePositiveAmount)
        print("\n\n ")

        # precision
        precision = float(truePositiveAmount) / len(blendList)
        recall = float(truePositiveAmount) / (len(blendAnswerList) - 32)

        print("***** Precision is : ")
        print(precision)
        print("\n\n")
        print("***** Recall is : ")
        print(recall)
        print("\n\n")

    # input
    inputTrie()
    inputCandidate()
    inputBlendAnswerList()

    # append filtered candidates to blendList
    for word in candidatesList:
        prefix, reverseSuffix = splitWord(word)
        if comparePrefixNDictUsingLED(
                prefix, 0) and comparePrefixNDictUsingLED(reverseSuffix, 1):
            blendList.append(word)

    calAccuracy()
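
The editDistance/localEditDistance helpers in Examples #24, #27 and #29 compute a local-alignment score with match +1 and mismatch/insert/delete -1, sizing the matrix on the first word only. For reference, a conventional Smith-Waterman-style formulation of that score looks like the sketch below (an illustration, not a drop-in replacement for the functions above):

import numpy as np

def local_alignment_score(word1, word2, match=1, mismatch=-1, gap=-1):
    """Best local alignment score between word1 and word2 (textbook recurrence)."""
    m, n = len(word1), len(word2)
    A = np.zeros((m + 1, n + 1))
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            s = match if word1[i - 1] == word2[j - 1] else mismatch
            A[i, j] = max(0, A[i - 1, j - 1] + s, A[i - 1, j] + gap, A[i, j - 1] + gap)
    return A.max()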