def get_glove_vocab(path, size=2000, d=200, variant='6B', filter_to_squad=False):
    # this is a copy of the function in preprocessing.py - but we can't use it as we'd get a circular import!
    def tokenise(text):
        sents = [s for s in sent_tokenize(text)]
        tokens = [tok.lower() for sent in sents for tok in TreebankWordTokenizer().tokenize(sent)]
        return tokens

    # special tokens always occupy the first four indices
    vocab = {PAD: 0, OOV: 1, SOS: 2, EOS: 3}

    if filter_to_squad:
        # restrict the vocab to words that actually occur in the SQuAD triples (context, question, answer)
        squad_words = set()
        squad_train = load_squad_triples(path, dev=False)
        squad_dev = load_squad_triples(path, dev=True)
        for triple in squad_train + squad_dev:
            squad_words |= set(tokenise(triple[0]))
            squad_words |= set(tokenise(triple[1]))
            squad_words |= set(tokenise(triple[2]))

    with open(path + 'glove.' + variant + '/glove.' + variant + '.' + str(d) + 'd.txt') as fp:
        entries = fp.readlines()

    for i, row in enumerate(entries):
        # stop once we have `size` words on top of the four special tokens
        if len(vocab) - 4 >= size and size > 0:
            break
        cols = row.strip().split(' ')
        if len(cols) < d + 1:
            print(row)
        if (filter_to_squad and cols[0] in squad_words) or not filter_to_squad:
            vocab[cols[0]] = len(vocab)
    return vocab
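# Hedged usage sketch for get_glove_vocab, assuming the GloVe 6B vectors have been
# unzipped to ./data/glove.6B/glove.6B.200d.txt and that PAD/OOV/SOS/EOS and
# load_squad_triples are defined elsewhere in this repo.
vocab = get_glove_vocab('./data/', size=2000, d=200, variant='6B')
print(len(vocab))    # 2004: the 2000 most frequent GloVe words plus the 4 special tokens
print(vocab['the'])  # likely 4 - the 6B files are ordered by corpus frequency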
def processBody(text):
    # print('Body: ', text)
    data = re.sub(r'\{\{.*\}\}', r' ', text)
    data = tokenise(data)
    data = remove_stopwords(data)
    data = stem(data)
    # print('Body: ', data)
    return data
def processTitle(title):
    # print('Title before', title)
    title = title.lower()
    title = tokenise(title)
    title = remove_stopwords(title)
    title = stem(title)
    # print('Title: ', title)
    return title
def processCategories(text):
    data = text.split('\n')
    categories = []
    for line in data:
        if re.match(r'\[\[category', line):
            categories.append(re.sub(r'\[\[category:(.*)\]\]', r'\1', line))
    data = tokenise(' '.join(categories))
    data = remove_stopwords(data)
    data = stem(data)
    # print('Categories: ', data)
    return data
def processLinks(text):
    data = text.split('\n')
    links = []
    for line in data:
        if re.match(r'\*[\ ]*\[', line):
            links.append(line)
    data = tokenise(' '.join(links))
    data = remove_stopwords(data)
    data = stem(data)
    # print('Links: ', data)
    return data
def get_vocab(corpus, vocab_size=2000):
    def tokenise(text):
        sents = [s for s in sent_tokenize(text)]
        tokens = [tok.lower() for sent in sents for tok in TreebankWordTokenizer().tokenize(sent)]
        return tokens

    vocab = {PAD: 0, OOV: 1, SOS: 2, EOS: 3}
    word_count = defaultdict(float)
    for l in corpus:
        # for w in l.lower().split():
        for w in tokenise(l):
            word_count[w] += 1
    vocab_list = sorted(word_count, key=word_count.__getitem__, reverse=True)[:min(vocab_size, len(word_count))]
    for w in vocab_list:
        vocab[w] = len(vocab)
    return vocab
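# Hedged usage sketch for get_vocab: the toy corpus below is illustrative only, and
# the PAD/OOV/SOS/EOS constants plus NLTK's punkt data are assumed to be available,
# as in the functions above.
corpus = [
    "Where is the Eiffel Tower?",
    "The Eiffel Tower is in Paris.",
]
vocab = get_vocab(corpus, vocab_size=10)
print(vocab)  # e.g. {<PAD>: 0, <OOV>: 1, <SOS>: 2, <EOS>: 3, 'the': 4, 'eiffel': 5, ...}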
def fetch_data(self):
    self.data = tokenise(self.data_reader.read())
    self.num_examples = len(self.data)
    self._labels_to_idx()
    if self.vocab:
        # vocab was supplied externally: build the word -> index lookup from it,
        # mapping unseen words to index 0
        self.string2idx = defaultdict(lambda: 0, {v: k for k, v in enumerate(self.vocab)})
    else:
        # otherwise build the vocab from the data itself, seeded with 'doctor' and 'patient'
        self.vocab, self.string2idx = create_vocab([['doctor', 'patient']] + [x[3] for x in self.data])
    if self.bow:
        self._bow_data()
    elif self.we:
        self._embed_data()
    self.reset()
def processInfo(text):
    data = text.split('\n')
    flag = -1   # -1: outside an infobox, 0: currently inside one
    info = []
    st = "}}"
    for line in data:
        if re.match(r'\{\{infobox', line):
            # keep the infobox header minus the opening braces, and start collecting lines
            info.append(re.sub(r'\{\{infobox(.*)', r'\1', line))
            flag = 0
        elif flag == 0:
            if line == st:
                # closing braces end the infobox
                flag = -1
                continue
            info.append(line)
    data = tokenise(' '.join(info))
    data = remove_stopwords(data)
    data = stem(data)
    # print("Info: ", data)
    return data
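# Hedged usage sketch for the wikitext field extractors above. It assumes the
# tokenise/remove_stopwords/stem helpers defined elsewhere in this repo are in
# scope, and the (already lower-cased) sample page text is illustrative only.
sample_page = (
    "{{infobox settlement\n"
    "| name = springfield\n"
    "}}\n"
    "springfield is a fictional town.\n"
    "[[category:fictional towns]]\n"
)
print(processInfo(sample_page))        # stemmed tokens from the infobox block
print(processCategories(sample_page))  # stemmed tokens from [[category:...]] lines
print(processBody(sample_page))        # stemmed tokens from the page body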
    # tail of the embedding-initialisation routine: words covered by GloVe keep their
    # pretrained vector, everything else falls back to a Glorot-uniform random vector
    glorot_limit = np.sqrt(6 / (D + len(vocab)))
    # clunky, but guarantees the order will be correct
    for id in range(len(rev_vocab)):
        word = rev_vocab[id]
        if word in glove.keys():
            embeddings.append(glove[word])
        else:
            # embeddings.append(q[id,:])
            embeddings.append(np.random.uniform(-glorot_limit, glorot_limit, size=(D)).tolist())
    return np.asarray(embeddings, dtype=np.float32)


if __name__ == "__main__":
    import sys
    sys.path.insert(0, "/Users/tom/Dropbox/msc-ml/project/src/")
    from preprocessing import char_pos_to_word, tokenise

    item = load_squad_dataset('./data/', False)[0]['paragraphs'][0]
    a = item['qas'][0]['answers'][0]
    context = item['context']
    toks = tokenise(context, asbytes=False)
    print(context)
    print(a)
    print(context[a['answer_start']:])
    # map the answer's character offset to a token index, then extend by the answer length
    ans_span = char_pos_to_word(context.encode(), [t.encode() for t in toks], a['answer_start'])
    ans_span = (ans_span, ans_span + len(tokenise(a['text'], asbytes=False)))
    print(toks[ans_span[0]:ans_span[1]])
def begin_search():
    f = open('./inverted_index/fileNumber.txt', 'r')
    global number_of_files
    number_of_files = int(f.read().strip())
    f.close()

    query_file = sys.argv[1]
    with open(query_file, 'r') as q:
        queries = q.readlines()

    data = ""
    for query in queries:
        # each query line starts with K, the number of results to return
        global K
        K = query.split(', ')[0]
        K = int(K)
        query = query.split(', ')[1:]
        temp_query = ''
        for i in query:
            temp_query += i + ' '
        query = temp_query
        query = query.lower()

        start = timeit.default_timer()
        if re.match(r'[t|b|i|c|l]:', query):
            # field query: split the "t:... b:..." prefixes into parallel field/token lists
            tempFields = re.findall(r'([t|b|c|i|l]):', query)
            words = re.findall(r'[t|b|c|i|l]:([^:]*)(?!\S)', query)
            # print(tempFields, words)
            fields, tokens = [], []
            si = len(words)
            i = 0
            while i < si:
                for word in words[i].split():
                    fields.append(tempFields[i])
                    tokens.append(word)
                i += 1
            tokens = remove_stopwords(tokens)
            tokens = stem(tokens)
            # print(fields, tokens)
            results = field_query_ranking(tokens, fields)
            # print(results)
        else:
            # plain query: rank against all fields
            tokens = tokenise(query)
            tokens = remove_stopwords(tokens)
            tokens = stem(tokens)
            results = simple_query_ranking(tokens)
            # print(results)

        if len(results) > 0:
            results = sorted(results, key=results.get, reverse=True)
            if len(results) > K:
                results = results[:K]
            for key in results:
                key.rstrip()
                title, title_doc_num = find_title(key)
                data += title_doc_num
                data += ', '
                # print(title_doc_num, end=' ')
                if title is not None:
                    for i in title:
                        data += i + ' '
                        # print(i, end=' ')
                data = data[:-1]
        else:
            data += "No results found! Try modifying the search by reducing the length maybe?\n"

        end = timeit.default_timer()
        data += str(end - start) + ', '
        data += str((end - start) / K)
        data += '\n\n'
        # print('\n')

    # print('data', data)
    with open('queries_op.txt', 'w') as f:
        f.write(data)
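# Hedged usage note for begin_search(): the script and query file names below are
# assumptions, but the format follows the parsing above - each line is "K, query",
# and a t:/b:/c:/i:/l: prefix restricts terms to the title, body, category, infobox
# or links index. Results and per-query timings are written to queries_op.txt.
#
#   $ cat queries.txt
#   5, t:gandhi b:independence movement
#   3, world cup cricket
#   $ python3 search.py queries.txt
if __name__ == '__main__':
    begin_search()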