def check_tf_year(corpus_ind_dir, query_list):
    """Return the term frequency of each query term in the indexed corpus.

    Multi-word terms are matched as phrases and counted by the number of
    matching sentences; their result key joins the words with underscores
    (e.g. "foo bar" -> "foo_bar").  Single words are counted via the
    index's raw term frequency.

    Args:
        corpus_ind_dir: directory containing the Whoosh index.
        query_list: iterable of raw query strings.

    Returns:
        dict mapping normalized term -> frequency count.
    """
    tf = {}
    ix = index.open_dir(corpus_ind_dir)  # load index
    with ix.searcher() as searcher:
        for t in query_list:
            # Keep only alphanumerics, underscores and spaces.
            t = re.sub(r'[^a-zA-Z0-9_ ]', '', t)
            words = t.split()
            if len(words) > 1:
                lowered = [w.lower() for w in words]
                # Phrase search; the hit count is the sentence frequency.
                results = searcher.search(
                    query.Phrase("content", lowered), limit=None)
                t = '_'.join(words)
                # len(results) replaces the original manual counting loop.
                tf[t] = len(results)
            else:
                tf[t] = searcher.frequency("content", t.lower())
    return tf
def test_boost_phrase():
    """Phrase hits boosted 1000x must outscore plain boosted-title terms."""
    schema = fields.Schema(title=fields.TEXT(field_boost=5.0, stored=True),
                           text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    words = u("alfa bravo charlie delta").split()
    writer = ix.writer()
    for perm in permutations(words):
        doc = u(" ").join(perm)
        writer.add_document(title=doc, text=doc)
    writer.commit()

    q = query.Or([query.Term("title", u("alfa")),
                  query.Term("title", u("bravo")),
                  query.Phrase("text",
                               [u("bravo"), u("charlie"), u("delta")])])

    def boost_phrases(node):
        # Multiply the boost of every Phrase node; recurse into others.
        if isinstance(node, query.Phrase):
            node.boost *= 1000.0
            return node
        return node.apply(boost_phrases)

    q = boost_phrases(q)

    with ix.searcher() as s:
        for hit in s.search(q, limit=None):
            if "bravo charlie delta" in hit["title"]:
                assert hit.score > 100.0
def test_searching():
    """Exercise a range of query types against the shared test index."""
    with make_index().searcher() as s:

        def _runq(q, result, **kwargs):
            # Run the query and compare the ordered ids of the hits.
            hits = s.search(q, **kwargs)
            assert_equal([d["id"] for d in hits], result)

        _runq(query.Term("text", u("format")), ["format", "vector"])
        _runq(query.Term("text", u("the")),
              ["fieldtype", "format", "const", "vector", "stored"])
        _runq(query.Prefix("text", u("st")), ["format", "vector", "stored"])
        _runq(query.Wildcard("id", u("*st*")), ["stored", "const"])
        _runq(query.TermRange("id", u("c"), u("s")),
              ["fieldtype", "format", "const"])
        _runq(query.NumericRange("subs", 10, 100),
              ["fieldtype", "format", "vector", "scorable"])
        _runq(query.Phrase("text", ["this", "field"]),
              ["scorable", "unique", "stored"], limit=None)
        all_ids = ["fieldtype", "format", "vector", "scorable", "stored",
                   "unique", "const"]
        _runq(query.Every(), all_ids)
        _runq(query.Every("subs"), all_ids)
def test_phrase_andmaybe():
    """ANDMAYBE with a quoted phrase parses to AndMaybe(Term, Phrase)."""
    parser = default.QueryParser("f", None)
    parsed = parser.parse(u('Dahmen ANDMAYBE "Besov Spaces"'))
    assert isinstance(parsed, query.AndMaybe)
    assert parsed[0] == query.Term("f", u("Dahmen"))
    assert parsed[1] == query.Phrase("f", [u("Besov"), u("Spaces")])
def test_vector_phrase(self):
    """Phrase queries over a vectored field use the VectorPhraseScorer."""
    ana = analysis.StandardAnalyzer()
    ftype = fields.FieldType(formats.Frequency(ana), formats.Positions(ana),
                             scorable=True)
    schema = fields.Schema(name=fields.ID(stored=True), value=ftype)
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    for doc_name, doc_value in [
            (u"A", u"Little Miss Muffet sat on a tuffet"),
            (u"B", u"Miss Little Muffet tuffet"),
            (u"C", u"Miss Little Muffet tuffet sat"),
            (u"D", u"Gibberish blonk falunk miss muffet sat tuffet garbonzo"),
            (u"E", u"Blah blah blah pancakes")]:
        writer.add_document(name=doc_name, value=doc_value)
    writer.commit()

    searcher = ix.searcher()

    def names(results):
        # Sorted 'name' fields of every hit.
        return sorted(hit['name'] for hit in results)

    q = query.Phrase("value",
                     [u"little", u"miss", u"muffet", u"sat", u"tuffet"])
    sc = q.scorer(searcher)
    self.assertEqual(sc.__class__.__name__, "VectorPhraseScorer")
    self.assertEqual(names(searcher.search(q)), ["A"])

    q = query.Phrase("value", [u"miss", u"muffet", u"sat", u"tuffet"])
    self.assertEqual(names(searcher.search(q)), ["A", "D"])

    q = query.Phrase("value", [u"falunk", u"gibberish"])
    self.assertEqual(names(searcher.search(q)), [])

    q = query.Phrase("value", [u"gibberish", u"falunk"], slop=2)
    self.assertEqual(names(searcher.search(q)), ["D"])

    # Repeated-word phrase: only "blah blah blah" should match doc E.
    q = query.Phrase("value", [u"blah"] * 3)
    self.assertEqual(names(searcher.search(q)), ["E"])
def make_phrase(self, fieldname, text):
    """Build a Phrase query for *text* on *fieldname*.

    When the field has an analyzer, the text is tokenized with stop words
    retained, stopped tokens are recorded in self.stopped_words, and only
    the unstopped token texts form the phrase.  Otherwise the text is
    simply split on spaces.
    """
    fieldname = fieldname or self.default_field
    analyzer = self._analyzer(fieldname)
    if not analyzer:
        # No analyzer: naive space split.
        return query.Phrase(fieldname, text.split(" "))
    tokens = [tok.copy() for tok in analyzer(text, removestops=False)]
    self.stopped_words.update(tok.text for tok in tokens if tok.stopped)
    words = [tok.text for tok in tokens if not tok.stopped]
    return query.Phrase(fieldname, words)
def test_posting_phrase(self):
    """Phrase queries over a plain TEXT field use the PostingPhraseScorer."""
    schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    for doc_name, doc_value in [
            (u"A", u"Little Miss Muffet sat on a tuffet"),
            (u"B", u"Miss Little Muffet tuffet"),
            (u"C", u"Miss Little Muffet tuffet sat"),
            (u"D", u"Gibberish blonk falunk miss muffet sat tuffet garbonzo"),
            (u"E", u"Blah blah blah pancakes")]:
        writer.add_document(name=doc_name, value=doc_value)
    writer.commit()

    searcher = ix.searcher()

    def names(results):
        # Sorted 'name' fields of every hit.
        return sorted(hit['name'] for hit in results)

    q = query.Phrase("value",
                     [u"little", u"miss", u"muffet", u"sat", u"tuffet"])
    sc = q.scorer(searcher)
    self.assertEqual(sc.__class__.__name__, "PostingPhraseScorer")
    self.assertEqual(names(searcher.search(q)), ["A"])

    q = query.Phrase("value", [u"miss", u"muffet", u"sat", u"tuffet"])
    self.assertEqual(names(searcher.search(q)), ["A", "D"])

    q = query.Phrase("value", [u"falunk", u"gibberish"])
    self.assertEqual(names(searcher.search(q)), [])

    q = query.Phrase("value", [u"gibberish", u"falunk"], slop=2)
    self.assertEqual(names(searcher.search(q)), ["D"])

    # Repeated-word phrase: only "blah blah blah" should match doc E.
    q = query.Phrase("value", [u"blah"] * 3)
    self.assertEqual(names(searcher.search(q)), ["E"])
def test_phrase_sameword():
    """A phrase of repeated words must only match genuine repetitions."""
    schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    writer.add_document(id=1, text=u("The film Linda Linda Linda is good"))
    writer.add_document(id=2, text=u("The model Linda Evangelista is pretty"))
    writer.commit()

    with ix.searcher() as s:
        hits = s.search(query.Phrase("text", ["linda", "linda", "linda"]),
                        limit=None)
        assert_equal(len(hits), 1)
        assert_equal(hits[0]["id"], 1)
def test_phrase_keywords():
    """Key terms extracted from phrase-query hits come back in rank order."""
    schema = fields.Schema(text=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)
    docs = [u("alfa bravo charlie delta"),
            u("bravo charlie delta echo"),
            u("charlie delta echo foxtrot"),
            u("delta echo foxtrot alfa"),
            u("echo foxtrot alfa bravo")]
    with ix.writer() as w:
        for doc in docs:
            w.add_document(text=doc)

    with ix.searcher() as s:
        hits = s.search(query.Phrase("text", u("alfa bravo").split()))
        assert len(hits) == 2
        kts = " ".join(term for term, _ in hits.key_terms("text"))
        assert kts == "alfa bravo charlie foxtrot delta"
def test_phrase_order():
    """Phrase matching respects word order within the document."""
    tfield = fields.TEXT(stored=True, analyzer=analysis.SimpleAnalyzer())
    ix = RamStorage().create_index(fields.Schema(text=tfield))

    writer = ix.writer()
    for perm in permutations(["ape", "bay", "can", "day"], 4):
        writer.add_document(text=u(" ").join(perm))
    writer.commit()

    with ix.searcher() as s:
        q = query.Phrase("text", ["bay", "can", "day"])
        hits = s.search(q, limit=None, sortedby=None)
        found = sorted(hit['text'] for hit in hits)
        assert_equal(found, [u('ape bay can day'), u('bay can day ape')])
def test_phrase_multi():
    """Phrase search works across an index built from multiple segments."""
    schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    domain = u("alfa bravo charlie delta echo").split()

    w = None
    for i, perm in enumerate(permutations(domain)):
        if w is None:
            w = ix.writer()
        w.add_document(id=i, text=u(" ").join(perm))
        # Commit every 30 documents to force multiple segments.
        if not i % 30:
            w.commit()
            w = None
    if w is not None:
        w.commit()

    with ix.searcher() as s:
        _ = s.search(query.Phrase("text", ["alfa", "bravo"]))
def check_sf(corpus_ind_dir, query_list):
    """Sentence/document frequency for each query term.

    Multi-word terms are searched as phrases (hit count = sentence
    frequency) and keyed with underscores; single words use the index's
    document frequency.

    Returns:
        (terms, counts): parallel lists of normalized terms and frequencies.
    """
    terms = []
    counts = []
    ix = index.open_dir(corpus_ind_dir)  # load index
    with ix.searcher() as searcher:
        for raw in query_list:
            cleaned = re.sub(r'[^a-zA-Z0-9_ ]', '', raw).lower()
            words = cleaned.split()
            if len(words) > 1:
                hits = searcher.search(query.Phrase("content", words),
                                       limit=None)
                docfreq = len(hits)
                cleaned = '_'.join(words)
            else:
                docfreq = searcher.doc_frequency("content", cleaned)
            terms.append(cleaned)
            counts.append(docfreq)
    return (terms, counts)
def test_posting_phrase():
    """Phrase queries on a TEXT field: matcher type, word order and slop."""
    schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    for doc_name, doc_value in [
            (u("A"), u("Little Miss Muffet sat on a tuffet")),
            (u("B"), u("Miss Little Muffet tuffet")),
            (u("C"), u("Miss Little Muffet tuffet sat")),
            (u("D"), u("Gibberish blonk falunk miss muffet sat "
                       "tuffet garbonzo")),
            (u("E"), u("Blah blah blah pancakes"))]:
        writer.add_document(name=doc_name, value=doc_value)
    writer.commit()

    with ix.searcher() as s:

        def names(results):
            # Sorted 'name' fields of every hit.
            return sorted(hit['name'] for hit in results)

        q = query.Phrase("value", [u("little"), u("miss"), u("muffet"),
                                   u("sat"), u("tuffet")])
        assert_equal(q.matcher(s).__class__.__name__, "SpanNearMatcher")
        r = s.search(q)
        assert_equal(names(r), ["A"])
        assert_equal(len(r), 1)

        q = query.Phrase("value", [u("miss"), u("muffet"), u("sat"),
                                   u("tuffet")])
        assert_equal(names(s.search(q)), ["A", "D"])

        q = query.Phrase("value", [u("falunk"), u("gibberish")])
        r = s.search(q)
        assert_equal(names(r), [])
        assert_equal(len(r), 0)

        q = query.Phrase("value", [u("gibberish"), u("falunk")], slop=2)
        assert_equal(names(s.search(q)), ["D"])

        # Four repetitions never occur; three match doc E.
        q = query.Phrase("value", [u("blah")] * 4)
        assert_equal(names(s.search(q)), [])

        q = query.Phrase("value", [u("blah")] * 3)
        m = q.matcher(s)
        assert_equal(names(s.search(q)), ["E"])
def test_phrase_score():
    """Repeated phrase occurrences in a document raise its phrase weight."""
    schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    for doc_name, doc_value in [
            (u("A"), u("Little Miss Muffet sat on a tuffet")),
            (u("D"), u("Gibberish blonk falunk miss muffet sat "
                       "tuffet garbonzo")),
            (u("E"), u("Blah blah blah pancakes")),
            (u("F"), u("Little miss muffet little miss muffet"))]:
        writer.add_document(name=doc_name, value=doc_value)
    writer.commit()

    with ix.searcher() as s:
        q = query.Phrase("value", [u("little"), u("miss"), u("muffet")])
        m = q.matcher(s)
        assert_equal(m.id(), 0)
        first_weight = m.weight()
        assert first_weight > 0
        m.next()
        # Doc F contains the phrase twice, so it must weigh more.
        assert_equal(m.id(), 3)
        assert m.weight() > first_weight
def test_phrase_score(self):
    """A doc with two phrase occurrences outscores one with a single hit."""
    schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    for doc_name, doc_value in [
            (u"A", u"Little Miss Muffet sat on a tuffet"),
            (u"D", u"Gibberish blonk falunk miss muffet sat tuffet garbonzo"),
            (u"E", u"Blah blah blah pancakes"),
            (u"F", u"Little miss muffet little miss muffet")]:
        writer.add_document(name=doc_name, value=doc_value)
    writer.commit()

    searcher = ix.searcher()
    q = query.Phrase("value", [u"little", u"miss", u"muffet"])
    sc = q.scorer(searcher)
    self.assertEqual(sc.id, 0)
    first_score = sc.score()
    self.assert_(first_score > 0)
    sc.next()
    # Doc F contains the phrase twice, so it must score higher.
    self.assertEqual(sc.id, 3)
    self.assert_(sc.score() > first_score)
def filter_corpus(corpus_ind_dir, query_list, year_from, year_to):
    """Filter the indexed corpus to articles matching any query term within
    [year_from, year_to], reassembling per-sentence rows into whole articles.

    Sentence ids look like "<article_id>_<n>"; all sentences of a matching
    article are re-fetched by id prefix and concatenated.

    Returns:
        CSV string (utf-8) of the filtered corpus, one row per article,
        with the 'id' column first.
    """
    ix = index.open_dir(corpus_ind_dir)  # load index
    with ix.searcher() as searcher:
        term_queries = []
        for t in query_list:
            t = re.sub(r'[^a-zA-Z0-9_ ]', '', t).lower()
            words = t.split()
            if len(words) > 1:
                term_queries.append(query.Phrase("content", words))
            else:
                term_queries.append(query.Term("content", t))
        year_queries = [query.Term("year", str(y))
                        for y in range(year_from, year_to + 1)]
        q_f = query.And([query.Or(term_queries), query.Or(year_queries)])

        results = searcher.search(q_f, limit=None)
        # Distinct article ids among the hits (set replaces the original
        # linear-scan membership test on a list).
        relevant_article_ids = {r["id"].split('_')[0] for r in results}

        new_corpus = []
        for r_article_id in sorted(relevant_article_ids):
            prefix_q = query.Prefix("id", r_article_id + "_")
            row_data = {}
            for x, r in enumerate(searcher.search(prefix_q, limit=None)):
                if x == 0:
                    # First sentence seeds the row with all stored fields.
                    for key in r:
                        if key == "content":
                            row_data["sentences"] = r['content']
                        elif key == "id":
                            row_data["id"] = r_article_id
                        else:
                            row_data[key] = r[key]
                else:
                    row_data["sentences"] += " " + r['content']
            new_corpus.append(row_data)

    pd_save = pd.DataFrame.from_records(new_corpus)
    # Put 'id' first, keep the remaining column order.
    cols = ['id'] + [col for col in pd_save if col != 'id']
    return pd_save[cols].to_csv(encoding='utf-8')
def check_group_sf_year(corpus_ind_dir, query_list, group_all):
    """Per-year sentence frequency (sf) and document frequency (df) for a
    group of query terms.

    Each entry of query_list may be a single term/phrase, a "+"-joined
    conjunction, or a "/"-joined disjunction.  The per-entry queries are
    combined with AND when group_all is true, otherwise with OR.

    Returns:
        (sf, df): dicts keyed by year; sf[y] counts matching sentences,
        df[y] counts distinct matching articles.
    """
    ix = index.open_dir(corpus_ind_dir)  # load index
    with ix.searcher() as searcher:

        def _subquery(raw):
            # Map one raw sub-term to a Phrase (multi-word) or Term query.
            cleaned = re.sub(r'[^a-zA-Z0-9 ]', ' ', raw).lower()
            words = cleaned.split()
            if len(words) > 1:
                return query.Phrase("content", words)
            return query.Term("content", cleaned)

        term_list_T = []
        for t in query_list:
            if "+" in t:  # AND group
                term_list_T.append(
                    query.And([_subquery(tx) for tx in t.split("+")]))
            elif "/" in t:  # OR group
                term_list_T.append(
                    query.Or([_subquery(tx) for tx in t.split("/")]))
            else:  # single term or phrase
                t = re.sub(r'[^a-zA-Z0-9 ]', ' ', t)
                words = t.split()
                if len(words) > 1:
                    # BUG FIX: the original passed splitted_lower() —
                    # calling the list raised TypeError; pass the list.
                    term_list_T.append(
                        query.Phrase("content",
                                     [w.lower() for w in words]))
                else:
                    term_list_T.append(query.Term("content", t.lower()))

        q = query.And(term_list_T) if group_all else query.Or(term_list_T)
        results = searcher.search(q, limit=None)

        sf = {}
        df = {}
        for r in results:
            y = int(r['year'])
            article_id = r["id"].split('_')[0]
            if y in sf:
                sf[y] += 1
                if article_id not in df[y]:
                    df[y].append(article_id)
            else:
                sf[y] = 1
                df[y] = [article_id]
        # Convert per-year article-id lists into counts.
        for y in df:
            df[y] = len(df[y])
    return (sf, df)
def check_df_year(corpus_ind_dir, query_list, year_from, year_to):
    """Per-year sentence frequency (sf) and document frequency (df) for
    each query term, restricted to years in [year_from, year_to].

    query_list entries may be single terms/phrases, "+"-joined
    conjunctions or "/"-joined disjunctions.  Result keys are the raw
    entries with spaces replaced by underscores.

    Returns:
        [sf, rel_article_no, df_f] where sf[t][y] is the sentence count,
        df_f[t][y] the distinct-article count, and rel_article_no[t] the
        number of distinct articles matching t across all years in range.
    """
    sf = {}
    df = {}
    rel_article_no = {}
    ix = index.open_dir(corpus_ind_dir)  # load index
    year_query = query.Or([query.Term("year", str(y))
                           for y in range(year_from, year_to + 1)])

    def _subquery(raw):
        # Map one raw sub-term to a Phrase (multi-word) or Term query.
        cleaned = re.sub(r'[^a-zA-Z0-9 ]', ' ', raw).lower()
        words = cleaned.split()
        if len(words) > 1:
            return query.Phrase("content", words)
        return query.Term("content", cleaned)

    with ix.searcher() as searcher:
        for t in query_list:
            if "+" in t:  # AND group
                q1 = query.And([_subquery(tx) for tx in t.split("+")])
            elif "/" in t:  # OR group
                q1 = query.Or([_subquery(tx) for tx in t.split("/")])
            else:  # single term or phrase (note: keeps underscores)
                cleaned = re.sub(r'[^a-zA-Z0-9_ ]', '', t).lower()
                words = cleaned.split()
                if len(words) > 1:
                    q1 = query.Phrase("content", words)
                else:
                    q1 = query.Term("content", cleaned)

            results = searcher.search(query.And([q1, year_query]),
                                      limit=None)
            key = t.replace(" ", "_")
            sf[key] = {}
            df[key] = {}
            relevant_articles = set()
            for r in results:
                y = int(r["year"])
                sf[key][y] = sf[key].get(y, 0) + 1
                article_id = r["id"].split('_')[0]
                df[key].setdefault(y, set()).add(article_id)
                relevant_articles.add(article_id)
            # BUG FIX: record the per-term article count here.  The
            # original read relevant_article_ids only after the whole
            # search loop, so every term got the count of the LAST term.
            rel_article_no[key] = len(relevant_articles)

    # Collapse per-year article-id sets into counts.
    df_f = {t: {y: len(ids) for y, ids in df[t].items()} for t in df}
    return [sf, rel_article_no, df_f]
def search_corpus(
        corpus_ind_dir, query_list, year_from, year_to,
        top_n=1000):  # the query terms in the list are connected by OR
    """Search the corpus for any query term within [year_from, year_to].

    Returns:
        [result_list, full_sents, total_hits, total_sent_no,
         relevant_article_count, total_doc_no]
        where result_list holds up to top_n hit rows, full_sents their
        lowercased contents, and the totals come from the index's
        "doc_num" / "sent_num" per-year count files.
    """
    import time
    start = time.time()

    def _count_in_range(path):
        # Sum per-year counts from a "<year> <count>" file, restricted to
        # [year_from, year_to].  Malformed (short) lines are skipped —
        # the original only guarded doc_num this way; applying the same
        # guard to sent_num avoids an IndexError on a blank line.
        total = 0
        with open(path) as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) >= 2 and year_from <= int(parts[0]) <= year_to:
                    total += int(parts[1])
        return total

    ix = index.open_dir(corpus_ind_dir)  # load index
    with ix.searcher() as searcher:
        term_queries = []
        for t in query_list:
            t = re.sub(r'[^a-zA-Z0-9_ ]', '', t).lower()
            words = t.split()
            if len(words) > 1:
                term_queries.append(query.Phrase("content", words))
            else:
                term_queries.append(query.Term("content", t))
        year_queries = [query.Term("year", str(y))
                        for y in range(year_from, year_to + 1)]
        q_f = query.And([query.Or(term_queries), query.Or(year_queries)])

        # Search the index.
        results = searcher.search(q_f, limit=None)
        result_list = []
        full_sents = []
        relevant_article_ids = []
        for i, r in enumerate(results, start=1):
            article_id = r["id"].split('_')[0]
            if article_id not in relevant_article_ids:
                relevant_article_ids.append(article_id)
            if i > top_n:
                # NOTE(review): as in the original, iteration stops here,
                # so articles beyond hit top_n+1 are not counted.
                break
            row_data = {
                "id": r["id"],
                "year": r["year"],
                "sentence": r["content"].lower(),  # snippet
                "title": r["title"].lower(),
                "author": r["author"],
                "document": r["content"].lower(),
            }
            # Carry through any extra stored fields verbatim.
            for key in r:
                if key not in ("content", "id", "title", "year", "author"):
                    row_data[key] = r[key]
            row_data["score"] = round(r.score, 3)
            result_list.append(row_data)
            full_sents.append({"sentence": row_data["document"]})

        total_doc_no = _count_in_range(corpus_ind_dir + "/doc_num")
        total_sent_no = _count_in_range(corpus_ind_dir + "/sent_num")

        # Fixed garbled literal: the original had a raw newline inside
        # the "Results returned:" string (a syntax error).
        print("Results returned:", time.time() - start)
        return [
            result_list, full_sents,
            len(results), total_sent_no,
            len(relevant_article_ids), total_doc_no
        ]