import re

from whoosh import index, query


def check_tf_year(corpus_ind_dir, query_list):
    """Count how often each query term or phrase occurs in the index."""
    tf = {}
    ix = index.open_dir(corpus_ind_dir)  # load the on-disk index
    with ix.searcher() as searcher:
        for t in query_list:
            t = re.sub(r'[^a-zA-Z0-9_ ]', '', t)
            splitted = t.split()
            if len(splitted) > 1:
                splitted_lower = [st.lower() for st in splitted]
                results = searcher.search(
                    query.Phrase("content", splitted_lower),
                    limit=None)  # number of hits = sentence frequency
                t = '_'.join(splitted)
                tf[t] = len(results)
            else:
                tf[t] = searcher.frequency("content", t.lower())

    return tf
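A minimal usage sketch for check_tf_year, assuming a Whoosh index with a
"content" field already exists at the hypothetical path "./corpus_index":

tf = check_tf_year("./corpus_index", ["neural network", "perceptron"])
# multi-word phrases come back keyed with underscores, e.g.
# {"neural_network": 42, "perceptron": 7}
print(tf)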
Example #2
def test_boost_phrase():
    schema = fields.Schema(title=fields.TEXT(field_boost=5.0, stored=True),
                           text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    domain = u("alfa bravo charlie delta").split()
    w = ix.writer()
    for ls in permutations(domain):
        t = u(" ").join(ls)
        w.add_document(title=t, text=t)
    w.commit()

    q = query.Or([query.Term("title", u("alfa")),
                  query.Term("title", u("bravo")),
                  query.Phrase("text", [u("bravo"), u("charlie"), u("delta")])
                  ])

    def boost_phrases(q):
        if isinstance(q, query.Phrase):
            q.boost *= 1000.0
            return q
        else:
            return q.apply(boost_phrases)
    q = boost_phrases(q)

    with ix.searcher() as s:
        r = s.search(q, limit=None)
        for hit in r:
            if "bravo charlie delta" in hit["title"]:
                assert hit.score > 100.0
Example #3
def test_searching():
    with make_index().searcher() as s:

        def _runq(q, result, **kwargs):
            r = s.search(q, **kwargs)
            assert_equal([d["id"] for d in r], result)

        _runq(query.Term("text", u("format")), ["format", "vector"])
        _runq(query.Term("text", u("the")),
              ["fieldtype", "format", "const", "vector", "stored"])
        _runq(query.Prefix("text", u("st")), ["format", "vector", "stored"])
        _runq(query.Wildcard("id", u("*st*")), ["stored", "const"])
        _runq(query.TermRange("id", u("c"), u("s")),
              ["fieldtype", "format", "const"])
        _runq(query.NumericRange("subs", 10, 100),
              ["fieldtype", "format", "vector", "scorable"])
        _runq(query.Phrase("text", ["this", "field"]),
              ["scorable", "unique", "stored"],
              limit=None)
        _runq(query.Every(), [
            "fieldtype", "format", "vector", "scorable", "stored", "unique",
            "const"
        ])
        _runq(query.Every("subs"), [
            "fieldtype", "format", "vector", "scorable", "stored", "unique",
            "const"
        ])
Example #4
def test_phrase_andmaybe():
    qp = default.QueryParser("f", None)

    q = qp.parse(u('Dahmen ANDMAYBE "Besov Spaces"'))
    assert isinstance(q, query.AndMaybe)
    assert q[0] == query.Term("f", u("Dahmen"))
    assert q[1] == query.Phrase("f", [u("Besov"), u("Spaces")])
Example #5
    def test_vector_phrase(self):
        ana = analysis.StandardAnalyzer()
        ftype = fields.FieldType(formats.Frequency(ana),
                                 formats.Positions(ana),
                                 scorable=True)
        schema = fields.Schema(name=fields.ID(stored=True), value=ftype)
        storage = RamStorage()
        ix = storage.create_index(schema)
        writer = ix.writer()
        writer.add_document(name=u"A",
                            value=u"Little Miss Muffet sat on a tuffet")
        writer.add_document(name=u"B", value=u"Miss Little Muffet tuffet")
        writer.add_document(name=u"C", value=u"Miss Little Muffet tuffet sat")
        writer.add_document(
            name=u"D",
            value=u"Gibberish blonk falunk miss muffet sat tuffet garbonzo")
        writer.add_document(name=u"E", value=u"Blah blah blah pancakes")
        writer.commit()

        searcher = ix.searcher()

        def names(results):
            return sorted([hit['name'] for hit in results])

        q = query.Phrase("value",
                         [u"little", u"miss", u"muffet", u"sat", u"tuffet"])
        sc = q.scorer(searcher)
        self.assertEqual(sc.__class__.__name__, "VectorPhraseScorer")

        self.assertEqual(names(searcher.search(q)), ["A"])

        q = query.Phrase("value", [u"miss", u"muffet", u"sat", u"tuffet"])
        self.assertEqual(names(searcher.search(q)), ["A", "D"])

        q = query.Phrase("value", [u"falunk", u"gibberish"])
        self.assertEqual(names(searcher.search(q)), [])

        q = query.Phrase("value", [u"gibberish", u"falunk"], slop=2)
        self.assertEqual(names(searcher.search(q)), ["D"])

        #q = query.Phrase("value", [u"blah"] * 4)
        #self.assertEqual(names(searcher.search(q)), []) # blah blah blah blah

        q = query.Phrase("value", [u"blah"] * 3)
        self.assertEqual(names(searcher.search(q)), ["E"])
Example #6
    def make_phrase(self, fieldname, text):
        fieldname = fieldname or self.default_field
        analyzer = self._analyzer(fieldname)
        if analyzer:
            tokens = [t.copy() for t in analyzer(text, removestops=False)]
            self.stopped_words.update((t.text for t in tokens if t.stopped))
            texts = [t.text for t in tokens if not t.stopped]
        else:
            texts = text.split(" ")

        return query.Phrase(fieldname, texts)
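A standalone sketch of the tokenize-then-Phrase flow the method above
performs, assuming a StandardAnalyzer (in the method itself the analyzer
actually comes from the parser's configured schema):

from whoosh import analysis, query

ana = analysis.StandardAnalyzer()
# removestops=False keeps stop words in the stream but flags them as stopped
tokens = [t.copy() for t in ana(u"the quick brown fox", removestops=False)]
texts = [t.text for t in tokens if not t.stopped]  # drops the stop word "the"
q = query.Phrase("content", texts)  # Phrase over ["quick", "brown", "fox"]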
Example #7
    def test_posting_phrase(self):
        schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
        storage = RamStorage()
        ix = storage.create_index(schema)
        writer = ix.writer()
        writer.add_document(name=u"A",
                            value=u"Little Miss Muffet sat on a tuffet")
        writer.add_document(name=u"B", value=u"Miss Little Muffet tuffet")
        writer.add_document(name=u"C", value=u"Miss Little Muffet tuffet sat")
        writer.add_document(
            name=u"D",
            value=u"Gibberish blonk falunk miss muffet sat tuffet garbonzo")
        writer.add_document(name=u"E", value=u"Blah blah blah pancakes")
        writer.commit()

        searcher = ix.searcher()

        def names(results):
            return sorted([hit['name'] for hit in results])

        q = query.Phrase("value",
                         [u"little", u"miss", u"muffet", u"sat", u"tuffet"])
        sc = q.scorer(searcher)
        self.assertEqual(sc.__class__.__name__, "PostingPhraseScorer")

        self.assertEqual(names(searcher.search(q)), ["A"])

        q = query.Phrase("value", [u"miss", u"muffet", u"sat", u"tuffet"])
        self.assertEqual(names(searcher.search(q)), ["A", "D"])

        q = query.Phrase("value", [u"falunk", u"gibberish"])
        self.assertEqual(names(searcher.search(q)), [])

        q = query.Phrase("value", [u"gibberish", u"falunk"], slop=2)
        self.assertEqual(names(searcher.search(q)), ["D"])

        #q = query.Phrase("value", [u"blah"] * 4)
        #self.assertEqual(names(searcher.search(q)), []) # blah blah blah blah

        q = query.Phrase("value", [u"blah"] * 3)
        self.assertEqual(names(searcher.search(q)), ["E"])
Example #8
def test_phrase_sameword():
    schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
    storage = RamStorage()
    ix = storage.create_index(schema)

    writer = ix.writer()
    writer.add_document(id=1, text=u("The film Linda Linda Linda is good"))
    writer.add_document(id=2, text=u("The model Linda Evangelista is pretty"))
    writer.commit()

    with ix.searcher() as s:
        r = s.search(query.Phrase("text", ["linda", "linda", "linda"]),
                     limit=None)
        assert_equal(len(r), 1)
        assert_equal(r[0]["id"], 1)
Example #9
def test_phrase_keywords():
    schema = fields.Schema(text=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(text=u("alfa bravo charlie delta"))
        w.add_document(text=u("bravo charlie delta echo"))
        w.add_document(text=u("charlie delta echo foxtrot"))
        w.add_document(text=u("delta echo foxtrot alfa"))
        w.add_document(text=u("echo foxtrot alfa bravo"))

    with ix.searcher() as s:
        q = query.Phrase("text", u("alfa bravo").split())
        r = s.search(q)
        assert len(r) == 2
        kts = " ".join(t for t, score in r.key_terms("text"))
        assert kts == "alfa bravo charlie foxtrot delta"
Example #10
def test_phrase_order():
    tfield = fields.TEXT(stored=True, analyzer=analysis.SimpleAnalyzer())
    schema = fields.Schema(text=tfield)
    storage = RamStorage()
    ix = storage.create_index(schema)

    writer = ix.writer()
    for ls in permutations(["ape", "bay", "can", "day"], 4):
        writer.add_document(text=u(" ").join(ls))
    writer.commit()

    with ix.searcher() as s:
        def result(q):
            r = s.search(q, limit=None, sortedby=None)
            return sorted([d['text'] for d in r])

        q = query.Phrase("text", ["bay", "can", "day"])
        assert_equal(result(q), [u('ape bay can day'), u('bay can day ape')])
Example #11
def test_phrase_multi():
    schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
    ix = RamStorage().create_index(schema)

    domain = u("alfa bravo charlie delta echo").split()
    w = None
    for i, ls in enumerate(permutations(domain)):
        if w is None:
            w = ix.writer()
        w.add_document(id=i, text=u(" ").join(ls))
        if not i % 30:
            w.commit()
            w = None
    if w is not None:
        w.commit()

    with ix.searcher() as s:
        q = query.Phrase("text", ["alfa", "bravo"])
        _ = s.search(q)
Example #12
import re

from whoosh import index, query


def check_sf(corpus_ind_dir, query_list):
    """Return each cleaned query term and its sentence frequency."""
    query_l = []
    sf = []
    ix = index.open_dir(corpus_ind_dir)  # load the on-disk index
    with ix.searcher() as searcher:
        for t in query_list:
            t = re.sub(r'[^a-zA-Z0-9_ ]', '', t).lower()
            splitted = t.split()
            if len(splitted) > 1:
                docfreq = len(
                    searcher.search(query.Phrase("content", splitted),
                                    limit=None))
                t = '_'.join(splitted)
            else:
                docfreq = searcher.doc_frequency("content", t)

            query_l.append(t)
            sf.append(docfreq)
    return (query_l, sf)
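A minimal usage sketch (hypothetical index path):

terms, sf = check_sf("./corpus_index", ["hidden markov model", "hmm"])
# terms -> ["hidden_markov_model", "hmm"]
# sf    -> matching sentence counts, in the same order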
Example #13
def test_posting_phrase():
    schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    writer.add_document(name=u("A"),
                        value=u("Little Miss Muffet sat on a tuffet"))
    writer.add_document(name=u("B"), value=u("Miss Little Muffet tuffet"))
    writer.add_document(name=u("C"), value=u("Miss Little Muffet tuffet sat"))
    writer.add_document(name=u("D"),
                        value=u("Gibberish blonk falunk miss muffet sat " +
                                "tuffet garbonzo"))
    writer.add_document(name=u("E"), value=u("Blah blah blah pancakes"))
    writer.commit()

    with ix.searcher() as s:
        def names(results):
            return sorted([hit['name'] for hit in results])

        q = query.Phrase("value", [u("little"), u("miss"), u("muffet"),
                                   u("sat"), u("tuffet")])
        m = q.matcher(s)
        assert_equal(m.__class__.__name__, "SpanNearMatcher")

        r = s.search(q)
        assert_equal(names(r), ["A"])
        assert_equal(len(r), 1)

        q = query.Phrase("value", [u("miss"), u("muffet"), u("sat"),
                                   u("tuffet")])
        assert_equal(names(s.search(q)), ["A", "D"])

        q = query.Phrase("value", [u("falunk"), u("gibberish")])
        r = s.search(q)
        assert_equal(names(r), [])
        assert_equal(len(r), 0)

        q = query.Phrase("value", [u("gibberish"), u("falunk")], slop=2)
        assert_equal(names(s.search(q)), ["D"])

        q = query.Phrase("value", [u("blah")] * 4)
        assert_equal(names(s.search(q)), [])  # blah blah blah blah

        q = query.Phrase("value", [u("blah")] * 3)
        m = q.matcher(s)
        assert_equal(names(s.search(q)), ["E"])
Example #14
def test_phrase_score():
    schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    writer.add_document(name=u("A"),
                        value=u("Little Miss Muffet sat on a tuffet"))
    writer.add_document(name=u("D"),
                        value=u("Gibberish blonk falunk miss muffet sat " +
                                "tuffet garbonzo"))
    writer.add_document(name=u("E"), value=u("Blah blah blah pancakes"))
    writer.add_document(name=u("F"),
                        value=u("Little miss muffet little miss muffet"))
    writer.commit()

    with ix.searcher() as s:
        q = query.Phrase("value", [u("little"), u("miss"), u("muffet")])
        m = q.matcher(s)
        assert_equal(m.id(), 0)
        score1 = m.weight()
        assert score1 > 0
        m.next()
        assert_equal(m.id(), 3)
        assert m.weight() > score1
Example #15
    def test_phrase_score(self):
        schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
        storage = RamStorage()
        ix = storage.create_index(schema)
        writer = ix.writer()
        writer.add_document(name=u"A",
                            value=u"Little Miss Muffet sat on a tuffet")
        writer.add_document(
            name=u"D",
            value=u"Gibberish blonk falunk miss muffet sat tuffet garbonzo")
        writer.add_document(name=u"E", value=u"Blah blah blah pancakes")
        writer.add_document(name=u"F",
                            value=u"Little miss muffet little miss muffet")
        writer.commit()

        searcher = ix.searcher()
        q = query.Phrase("value", [u"little", u"miss", u"muffet"])
        sc = q.scorer(searcher)
        self.assertEqual(sc.id, 0)
        score1 = sc.score()
        self.assert_(score1 > 0)
        sc.next()
        self.assertEqual(sc.id, 3)
        self.assert_(sc.score() > score1)
Example #16
import re

import pandas as pd
from whoosh import index, query


def filter_corpus(corpus_ind_dir, query_list, year_from, year_to):
    """Return a CSV of the full articles matching any query term in the year range."""
    ix = index.open_dir(corpus_ind_dir)  # load the on-disk index

    with ix.searcher() as searcher:

        term_list_T = []
        term_list_Y = []

        for t in query_list:
            t = re.sub(r'[^a-zA-Z0-9_ ]', '', t).lower()
            splitted = t.split()
            if len(splitted) > 1:
                term_list_T.append(query.Phrase("content", splitted))
            else:
                term_list_T.append(query.Term("content", t))

        for y in range(year_from, year_to + 1):
            term_list_Y.append(query.Term("year", str(y)))

        q1 = query.Or(term_list_T)
        q2 = query.Or(term_list_Y)

        q_f = query.And([q1, q2])

        results = searcher.search(q_f, limit=None)

        relevant_article_ids = []
        for r in results:
            article_id = r["id"].split('_')[0]
            if article_id not in relevant_article_ids:
                relevant_article_ids.append(article_id)

        new_corpus = []
        for r_article_id in sorted(relevant_article_ids):
            article_id = r_article_id + "_"
            q = query.Prefix("id", article_id)
            x = 0
            row_data = {}
            for r in searcher.search(q, limit=None):
                if x == 0:
                    # first sentence of the article: copy all stored fields
                    for key in r:
                        if key == "content":
                            row_data["sentences"] = r['content']
                            x += 1
                        elif key == "id":
                            row_data["id"] = article_id[:-1]
                        else:
                            row_data[key] = r[key]
                else:
                    # later sentences: append to the running article text
                    row_data["sentences"] += " " + r['content']
            new_corpus.append(row_data)

        pd_save = pd.DataFrame.from_records(new_corpus)
        cols = ['id'] + [col for col in pd_save if col != 'id']
        pd_save = pd_save[cols]
        return pd_save.to_csv(encoding='utf-8')
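A usage sketch: write the filtered sub-corpus to disk (paths hypothetical):

csv_text = filter_corpus("./corpus_index", ["climate change"], 1990, 2000)
with open("filtered_corpus.csv", "w", encoding="utf-8") as out:
    out.write(csv_text)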
Example #17
import re

from whoosh import index, query


def check_group_sf_year(corpus_ind_dir, query_list, group_all):
    """Per-year sentence and document frequency for a group of query terms."""
    ix = index.open_dir(corpus_ind_dir)  # load the on-disk index

    with ix.searcher() as searcher:

        term_list_T = []

        for t in query_list:
            if "+" in t:  #AND
                t_list = t.split("+")
                term_list_T_AND = []
                for tx in t_list:
                    tx = re.sub(r'[^a-zA-Z0-9 ]', ' ', tx).lower()
                    splitted = tx.split()
                    if len(splitted) > 1:
                        term_list_T_AND.append(
                            query.Phrase("content", splitted))
                    else:
                        term_list_T_AND.append(query.Term("content", tx))

                term_list_T.append(query.And(term_list_T_AND))  #AND

            elif "/" in t:  #OR
                t_list = t.split("/")
                term_list_T_OR = []
                for tx in t_list:
                    tx = re.sub(r'[^a-zA-Z0-9 ]', ' ', tx).lower()
                    splitted = tx.split()
                    if len(splitted) > 1:
                        term_list_T_OR.append(query.Phrase(
                            "content", splitted))
                    else:
                        term_list_T_OR.append(query.Term("content", tx))

                term_list_T.append(query.Or(term_list_T_OR))  #OR

            else:  #single term
                t = re.sub(r'[^a-zA-Z0-9 ]', ' ', t)
                splitted = t.split()
                if len(splitted) > 1:
                    splitted_lower = [st.lower() for st in splitted]
                    term_list_T.append(
                        query.Phrase("content", splitted_lower))
                else:
                    term_list_T.append(query.Term("content", t.lower()))

        if group_all:
            q = query.And(term_list_T)
        else:
            q = query.Or(term_list_T)

        results = searcher.search(q, limit=None)
        sf = {}
        df = {}

        for r in results:
            y = int(r['year'])
            article_id = r["id"].split('_')[0]

            if y in sf:
                sf[y] += 1
                if article_id not in df[y]:
                    df[y].append(article_id)
            else:
                sf[y] = 1
                df[y] = [article_id]

        for y in df:
            df[y] = len(df[y])

        return (sf, df)
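A usage sketch showing the inline operators understood by this function: "+"
joins sub-terms with AND and "/" joins them with OR (index path hypothetical):

sf, df = check_group_sf_year("./corpus_index",
                             ["speech recognition+neural", "hmm/gmm"],
                             group_all=False)
# sf[year] -> matching sentences in that year; df[year] -> distinct articles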
Example #18
import re

from whoosh import index, query


def check_df_year(corpus_ind_dir, query_list, year_from, year_to):
    """Per-term, per-year sentence and document frequency within a year range."""
    sf = {}
    df = {}
    rel_article_no = {}
    term_list_Y = []
    ix = index.open_dir(corpus_ind_dir)  # load the on-disk index

    for y in range(year_from, year_to + 1):
        term_list_Y.append(query.Term("year", str(y)))

    q2 = query.Or(term_list_Y)

    with ix.searcher() as searcher:
        for t in query_list:
            relevant_article_ids = []
            if "+" in t:  #AND
                t_list = t.split("+")
                term_list_T_AND = []
                for tx in t_list:
                    tx = re.sub(r'[^a-zA-Z0-9 ]', ' ', tx).lower()
                    splitted = tx.split()
                    if len(splitted) > 1:
                        term_list_T_AND.append(
                            query.Phrase("content", splitted))
                    else:
                        term_list_T_AND.append(query.Term("content", tx))
                q1 = query.And(term_list_T_AND)
            elif "/" in t:  #AND
                t_list = t.split("/")
                term_list_T_OR = []
                for tx in t_list:
                    tx = re.sub(r'[^a-zA-Z0-9 ]', ' ', tx).lower()
                    splitted = tx.split()
                    if len(splitted) > 1:
                        term_list_T_OR.append(query.Phrase(
                            "content", splitted))
                    else:
                        term_list_T_OR.append(query.Term("content", tx))
                q1 = query.Or(term_list_T_OR)

            else:  # single term
                tx = re.sub(r'[^a-zA-Z0-9_ ]', '', t).lower()
                splitted = tx.split()
                if len(splitted) > 1:
                    q1 = query.Phrase("content", splitted)
                else:
                    q1 = query.Term("content", tx)

            q_f = query.And([q1, q2])
            results = searcher.search(q_f, limit=None)
            t = t.replace(" ", "_")

            sf[t] = {}
            df[t] = {}
            for r in results:
                y = int(r["year"])
                if y in sf[t]:
                    sf[t][y] += 1
                else:
                    sf[t][y] = 1

                article_id = r["id"].split('_')[0]

                if y not in df[t]:
                    df[t][y] = [article_id]
                elif article_id not in df[t][y]:
                    df[t][y].append(article_id)

                if article_id not in relevant_article_ids:
                    relevant_article_ids.append(article_id)

            rel_article_no[t] = len(relevant_article_ids)

        # convert the per-year article-id lists into document counts
        df_f = {}
        for term in df:
            df_f[term] = {}
            for y in df[term]:
                df_f[term][y] = len(df[term][y])

    return [sf, rel_article_no, df_f]
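A usage sketch (hypothetical index path and terms):

sf, rel_no, df = check_df_year("./corpus_index", ["pos tagging", "parser"],
                               1980, 1995)
# sf["pos_tagging"][1990]  -> sentence hits in 1990
# df["pos_tagging"][1990]  -> distinct articles with a hit in 1990
# rel_no["pos_tagging"]    -> total distinct matching articles in the range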
Example #19
import re
import time

from whoosh import index, query


def search_corpus(
        corpus_ind_dir,
        query_list,
        year_from,
        year_to,
        top_n=1000):  # query terms in the list are OR-ed together

    start = time.time()

    ix = index.open_dir(corpus_ind_dir)  # load the on-disk index

    with ix.searcher() as searcher:

        term_list_T = []
        term_list_Y = []

        for t in query_list:
            t = re.sub(r'[^a-zA-Z0-9_ ]', '', t).lower()
            splitted = t.split()
            if len(splitted) > 1:
                term_list_T.append(query.Phrase("content", splitted))
            else:
                term_list_T.append(query.Term("content", t))

        for y in range(year_from, year_to + 1):
            term_list_Y.append(query.Term("year", str(y)))

        q1 = query.Or(term_list_T)
        q2 = query.Or(term_list_Y)

        q_f = query.And([q1, q2])

        # search the index
        results = searcher.search(q_f, limit=None)

        result_list = []
        full_sents = []
        relevant_article_ids = []
        i = 0

        for r in results:
            i += 1
            article_id = r["id"].split('_')[0]
            if article_id not in relevant_article_ids:
                relevant_article_ids.append(article_id)

            if i <= top_n:
                row_data = {}

                row_data["id"] = r["id"]
                row_data["year"] = r["year"]
                row_data["sentence"] = r["content"].lower()  #snipet
                row_data["title"] = r["title"].lower()
                row_data["author"] = r["author"]
                row_data["document"] = r["content"].lower()

                for key in r:
                    if key not in ["content", "id", "title", "year", "author"]:
                        row_data[key] = r[key]

                row_data["score"] = round(r.score, 3)

                result_list.append(row_data)
                full_sents.append({"sentence": row_data["document"]})
            else:
                break

        with open(corpus_ind_dir + "/doc_num") as f:
            total_doc_no = 0
            lines = f.readlines()

            for line in lines:
                doc_num = line.strip().split()
                if len(doc_num) >= 2:
                    if ((int(doc_num[0]) >= year_from) &
                        (int(doc_num[0]) <= year_to)):
                        total_doc_no += int(doc_num[1])

        f.close()

        with open(corpus_ind_dir + "/sent_num") as f:
            total_sent_no = 0
            lines = f.readlines()

            for line in lines:
                sent_num = line.strip().split()
                if ((int(sent_num[0]) >= year_from) &
                    (int(sent_num[0]) <= year_to)):
                    total_sent_no += int(sent_num[1])

        f.close()

        print("Results returned:", time.time() - start)
        return [
            result_list, full_sents,
            len(results), total_sent_no,
            len(relevant_article_ids), total_doc_no
        ]
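A usage sketch; it assumes the index directory also contains plain-text
"doc_num" and "sent_num" files with one "year count" pair per line, which is
the format the code above reads:

(rows, sents, n_sents, total_sents,
 n_docs, total_docs) = search_corpus("./corpus_index",
                                     ["machine translation"],
                                     1970, 1990, top_n=100)
print(n_sents, "matching sentences out of", total_sents)
print(n_docs, "matching articles out of", total_docs)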