Example no. 1
def get_glove_vocab(path,
                    size=2000,
                    d=200,
                    variant='6B',
                    filter_to_squad=False):
    # this is a copy of the function in preprocessing.py - but we can't use it as we'd get a circular import!
    def tokenise(text):
        sents = sent_tokenize(text)
        tokens = [
            tok.lower() for sent in sents
            for tok in TreebankWordTokenizer().tokenize(sent)
        ]
        return tokens

    vocab = {PAD: 0, OOV: 1, SOS: 2, EOS: 3}
    if filter_to_squad:
        squad_words = set()
        squad_train = load_squad_triples(path, dev=False)
        squad_dev = load_squad_triples(path, dev=True)
        for triple in squad_train + squad_dev:
            squad_words |= set(tokenise(triple[0]))
            squad_words |= set(tokenise(triple[1]))
            squad_words |= set(tokenise(triple[2]))
    with open(path + 'glove.' + variant + '/glove.' + variant + '.' + str(d) +
              'd.txt') as fp:
        entries = fp.readlines()
    for i, row in enumerate(entries):
        # stop once `size` non-special tokens have been added (the 4 special tokens are pre-seeded)
        if len(vocab) - 4 >= size and size > 0:
            break
        cols = row.strip().split(' ')
        if len(cols) < d + 1:
            # malformed embedding row (fewer than d values after the token); print it for inspection
            print(row)
        if not filter_to_squad or cols[0] in squad_words:
            vocab[cols[0]] = len(vocab)
    return vocab
def processBody(text):
    # print('Body: ',text)
    data = re.sub(r'\{\{.*\}\}', r' ', text)  # strip inline {{...}} template markup
    data = tokenise(data)
    data = remove_stopwords(data)
    data = stem(data)
    # print('Body: ',data)
    return data
def processTitle(title):
    # print('Title before', title)
    title = title.lower()
    title = tokenise(title)
    title = remove_stopwords(title)
    title = stem(title)
    # print('Title: ', title)
    return title
def processCategories(text):
    data = text.split('\n')
    categories = []
    for line in data:
        if re.match(r'\[\[category', line):
            # keep only the category name from '[[category:...]]' lines
            categories.append(re.sub(r'\[\[category:(.*)\]\]', r'\1', line))
    data = tokenise(' '.join(categories))
    data = remove_stopwords(data)
    data = stem(data)
    # print('Categories: ', data)
    return data
def processLinks(text):
    data = text.split('\n')
    links = []
    for line in data:
        if re.match(r'\*[\ ]*\[', line):
            # lines starting with '* [' are treated as link entries
            links.append(line)
    data = tokenise(' '.join(links))
    data = remove_stopwords(data)
    data = stem(data)
    # print('Links: ', data)
    return data
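
All four process* helpers above lean on module-level tokenise, remove_stopwords and stem functions that are not part of the snippet. A minimal sketch of what such helpers could look like, built on NLTK; the real project supplies its own versions, so treat these definitions as assumptions:

# Illustrative stand-ins only; the excerpted project defines its own
# tokenise/remove_stopwords/stem. These NLTK-based versions just make the
# snippets above runnable for experimentation.
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TreebankWordTokenizer, sent_tokenize

_STOPWORDS = set(stopwords.words('english'))
_STEMMER = PorterStemmer()

def tokenise(text):
    # sentence-split, then word-tokenise and lowercase
    return [tok.lower()
            for sent in sent_tokenize(text)
            for tok in TreebankWordTokenizer().tokenize(sent)]

def remove_stopwords(tokens):
    return [t for t in tokens if t not in _STOPWORDS]

def stem(tokens):
    return [_STEMMER.stem(t) for t in tokens]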
Example no. 6
def get_vocab(corpus, vocab_size=2000):
    def tokenise(text):
        sents = sent_tokenize(text)
        tokens = [
            tok.lower() for sent in sents
            for tok in TreebankWordTokenizer().tokenize(sent)
        ]
        return tokens

    vocab = {PAD: 0, OOV: 1, SOS: 2, EOS: 3}
    word_count = defaultdict(float)
    for l in corpus:
        # for w in l.lower().split():
        for w in tokenise(l):
            word_count[w] += 1
    vocab_list = sorted(word_count, key=word_count.__getitem__,
                        reverse=True)[:min(vocab_size, len(word_count))]
    for w in vocab_list:
        vocab[w] = len(vocab)
    return vocab
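
A quick usage sketch for get_vocab. The special-token values below are assumptions (the snippet only shows that PAD, OOV, SOS and EOS exist) and would need to be defined in the same module as get_vocab; the corpus is made up for illustration:

# Hypothetical values for the module-level special tokens.
PAD, OOV, SOS, EOS = '<PAD>', '<OOV>', '<SOS>', '<EOS>'

corpus = [
    "The doctor examined the patient.",
    "The patient thanked the doctor.",
]
vocab = get_vocab(corpus, vocab_size=10)
# Four special ids come first, then the most frequent corpus tokens, e.g.
# {'<PAD>': 0, '<OOV>': 1, '<SOS>': 2, '<EOS>': 3, 'the': 4, 'doctor': 5, ...}
print(vocab)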
Example no. 7
def fetch_data(self):
    self.data = tokenise(self.data_reader.read())
    self.num_examples = len(self.data)
    self._labels_to_idx()
    if self.vocab:
        # map each vocabulary string to its index; unseen strings fall back to index 0
        self.string2idx = defaultdict(
            lambda: 0, {v: k for k, v in enumerate(self.vocab)})
    else:
        self.vocab, self.string2idx = create_vocab(
            [['doctor', 'patient']] + [x[3] for x in self.data])
    if self.bow:
        self._bow_data()
    elif self.we:
        self._embed_data()
    self.reset()
def processInfo(text):
    data = text.split('\n')
    flag = -1
    info = []
    st = "}}"
    for line in data:
        if re.match(r'\{\{infobox', line):
            info.append(re.sub(r'\{\{infobox(.*)', r'\1', line))
            flag = 0
        elif flag == 0:
            if line == st:
                flag = -1
                continue
            info.append(line)
    data = tokenise(' '.join(info))
    data = remove_stopwords(data)
    data = stem(data)
    # print("Info: ", data)
    return data
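
processInfo walks the page line by line: a line matching '{{infobox' opens a block, every following line is collected, and a bare '}}' line closes it. A toy input (made up here) shows the effect, again assuming the tokenise/remove_stopwords/stem helpers sketched earlier:

# Hypothetical wiki markup; real input comes from whatever dump the project parses.
page = '\n'.join([
    '{{infobox settlement',
    '| name = springfield',
    '| population = 30000',
    '}}',
    'springfield is a fictional town.',
])
print(processInfo(page))
# -> stemmed, stopword-free tokens taken from the infobox lines only;
#    the body text after '}}' is ignored by this function.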
Example no. 9
    # Glorot/Xavier uniform limit, sqrt(6 / (fan_in + fan_out)), for words missing from GloVe
    glorot_limit = np.sqrt(6 / (D + len(vocab)))

    # clunky, but guarantees the order will be correct
    for id in range(len(rev_vocab)):
        word = rev_vocab[id]
        if word in glove:
            embeddings.append(glove[word])
        else:
            # embeddings.append(q[id,:])
            embeddings.append(
                np.random.uniform(-glorot_limit, glorot_limit,
                                  size=D).tolist())
    return np.asarray(embeddings, dtype=np.float32)


if __name__ == "__main__":
    import sys
    sys.path.insert(0, "/Users/tom/Dropbox/msc-ml/project/src/")
    from preprocessing import char_pos_to_word, tokenise
    item = load_squad_dataset('./data/', False)[0]['paragraphs'][0]
    a = item['qas'][0]['answers'][0]
    context = item['context']
    toks = tokenise(context, asbytes=False)
    print(context)
    print(a)
    print(context[a['answer_start']:])
    ans_span = char_pos_to_word(context.encode(), [t.encode() for t in toks],
                                a['answer_start'])
    ans_span = (ans_span, ans_span + len(tokenise(a['text'], asbytes=False)))
    print(toks[ans_span[0]:ans_span[1]])
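
The glorot_limit in this example follows the Glorot/Xavier uniform initialisation, limit = sqrt(6 / (fan_in + fan_out)), with the embedding dimension D and the vocabulary size as the two fan terms; words missing from GloVe get a vector drawn from U(-limit, limit). A compact, self-contained version of that fallback (the function name is illustrative, not from the project):

import numpy as np

def random_glove_fallback(d, vocab_size, rng=np.random):
    # Glorot/Xavier uniform limit, as used above: sqrt(6 / (d + vocab_size))
    limit = np.sqrt(6.0 / (d + vocab_size))
    return rng.uniform(-limit, limit, size=d).astype(np.float32)

Since the full embedding matrix is later cast to float32, drawing the fallback rows directly as float32 keeps the dtype consistent.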
Example no. 10
def begin_search():
    global number_of_files
    with open('./inverted_index/fileNumber.txt', 'r') as f:
        number_of_files = int(f.read().strip())

    query_file = sys.argv[1]
    with open(query_file, 'r') as q:
        queries = q.readlines()
    data = ""
    for query in queries:
        global K
        K = int(query.split(', ')[0])
        # rejoin the query text (everything after the leading 'K, ')
        query = ''.join(part + ' ' for part in query.split(', ')[1:])
        query = query.lower()
        start = timeit.default_timer()
        # field prefixes: t(itle), b(ody), i(nfobox), c(ategory), l(inks)
        if re.match(r'[tbicl]:', query):
            tempFields = re.findall(r'([tbicl]):', query)
            words = re.findall(r'[tbicl]:([^:]*)(?!\S)', query)
            # print(tempFields, words)
            fields, tokens = [], []
            # pair each field prefix with every word in its group
            for i in range(len(words)):
                for word in words[i].split():
                    fields.append(tempFields[i])
                    tokens.append(word)
            tokens = remove_stopwords(tokens)
            tokens = stem(tokens)
            # print(fields, tokens)
            results = field_query_ranking(tokens, fields)
            # print(results)

        else:
            tokens = tokenise(query)
            tokens = remove_stopwords(tokens)
            tokens = stem(tokens)
            results = simple_query_ranking(tokens)
            # print(results)
        if len(results) > 0:
            results = sorted(results, key=results.get, reverse=True)
            if len(results) > K:
                results = results[:K]
            for key in results:
                key = key.rstrip()
                title, title_doc_num = find_title(key)
                data += title_doc_num
                data += ', '
                # print(title_doc_num, end = ' ')
                if title is not None:
                    for i in title:
                        data += i + ' '
                        # print(i, end = ' ')
                    data = data[:-1]
        else:
            data += "No results found! Try modifying the search by reducing the length maybe?\n"
        end = timeit.default_timer()
        data += str(end - start) + ', '
        data += str((end - start) / K)
        data += '\n\n'
        # print('\n')
    # print('data', data)
    with open('queries_op.txt', 'w') as f:
        f.write(data)
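
From the parsing in begin_search, each line of the input file starts with the number of results to return (K), a comma, and then the query text; field-restricted queries use the prefixes t:, b:, c:, i: and l: (presumably title, body, category, infobox and links, matching the process* functions above). A made-up query file and invocation, with the script name assumed:

queries.txt (illustrative contents):

    10, barack obama
    5, t:gandhi b:independence

$ python3 search.py queries.txt    # script name is an assumption

Results are written to queries_op.txt, one block per query; each block ends with the elapsed time and the elapsed time divided by K.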