    def test_getFieldInfos(self):

        # index a sample document first so the reader has fields to inspect
        self.test_indexDocument()

        store = self.openStore()
        reader = None
        try:
            reader = DirectoryReader.open(store)
            fieldInfos = MultiFields.getMergedFieldInfos(reader)
            for fieldInfo in fieldInfos.iterator():
                # every field seen by the reader
                self.assertIn(fieldInfo.name,
                              ['owner', 'search_name', 'meta_words',
                               'docid', 'title'])

                # only a subset of the fields is indexed
                if fieldInfo.isIndexed():
                    self.assertIn(fieldInfo.name,
                                  ['owner', 'meta_words', 'docid', 'title'])

                # none of the indexed fields stores term vectors
                if fieldInfo.isIndexed() and not fieldInfo.hasVectors():
                    self.assertIn(fieldInfo.name,
                                  ['owner', 'meta_words', 'docid', 'title'])
        finally:
            store = self.closeStore(store, reader)
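
The openStore/closeStore helpers used above come from the test fixture and are not shown; a minimal sketch of what they might look like with an in-memory store (the class name and details here are assumptions, not the original fixture):

import lucene
from org.apache.lucene.store import RAMDirectory

class StoreFixtureSketch(object):

    def openStore(self):
        # an in-memory directory; a real fixture might open an
        # FSDirectory on disk instead
        return RAMDirectory()

    def closeStore(self, store, *streams):
        # close any readers/writers first, then the store itself;
        # returns None so callers can clear their reference
        for stream in streams:
            if stream is not None:
                stream.close()
        store.close()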
Example #2
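This example is lifted from the middle of a longer sentiment-analysis script, so its imports are not shown. It appears to assume roughly the following (sp_ch, sa, and a_sa are project-local modules; the names below are inferred from the calls in the snippet and are hypothetical):

import sys
import pickle

import numpy as np
import lucene
from tqdm import tqdm
from org.apache.lucene.index import MultiFields

# project-local helpers assumed by the snippet:
# import spellcheck_utils as sp_ch      (hypothetical module name)
# import sentiment_analysis as sa       (hypothetical module name)
# import aggregate_sa as a_sa           (hypothetical module name)
# get_doc_df, get_dict and define_search_params are also assumed to be
# defined elsewhere in the same script.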
def main():
    #constants
    FIELD_CONTENTS = "text"
    DOC_NAME = "identifier"
    STORE_DIR = "./full_index1"

    #take the search term and options as command line arguments
    if len(sys.argv) != 4:
        print('Usage: python search_docs.py [term to search for] [redo? y/n] [window size]')
        sys.exit(1)

    #parse user input
    TERM = sys.argv[1]
    remake_df = sys.argv[2] == 'y'
    window_size = int(sys.argv[3])

    #other options
    stem_flag = True
    spell_check_flag = False

    #get dataframe
    doc_data = get_doc_df(remake_df)

    #get dictionary
    SA_dict = get_dict(stem_flag)

    print('Searching for: "' + TERM + '"')

    sa_term = []

    date_range = (1791, 1800)
    method = 'linear'  #vs 1/x

    example_flag = False

    #full_dict = pickle.load(open('./spellcheck/full_word_list.pkl', 'rb'))

    full_dict, modern_dict, map_chars, charlist = sp_ch.load_clean_word_list()

    ### replacement table
    rep_data = pickle.load(open('./spellcheck/rep_table.pkl', 'rb'))
    print(rep_data)
    rep_table = rep_data['rep_table']
    charlist = rep_data['charlist']
    try:
        map_chars = rep_data['charmap']
    except KeyError:
        # older replacement tables keep the character map under a different key
        map_chars = rep_data['map_chars']

    top_n = 4
    top_replacements = {}
    for cf, from_letter in enumerate(rep_table):
        # sort candidate replacement characters by descending frequency
        sort_idx = np.argsort(from_letter)[::-1]
        top_rep = sort_idx[:top_n]
        top_replacements[charlist[cf]] = [charlist[char] for char in top_rep]
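    # e.g. with top_n = 4, top_replacements might map 'e' to ['e', 'c', 'o', 'a'],
    # i.e. the characters most often found where an 'e' was expected
    # (illustrative values only, not taken from the actual table)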

    # cache checks (currently bypassed, so scores are always recomputed):
    # if not 'sentiment_vals_w_'+TERM in list(doc_data):
    if 1:  # not glob.glob('./pickles/%s_df.pkl'%TERM):
        lucene.initVM()
        searcher, reader, query = define_search_params(STORE_DIR,
                                                       FIELD_CONTENTS, TERM)

        fieldInfos = MultiFields.getMergedFieldInfos(reader)
        print(fieldInfos)
        for fieldInfo in fieldInfos.iterator():
            print(fieldInfo.name)
        # Run the query and get documents that contain the term
        docs_containing_term = searcher.search(query, reader.numDocs())

        print('Found ' + str(len(docs_containing_term.scoreDocs)) +
              ' documents with the term "' + TERM + '".')
        print('Calculating sentiment scores...')
        term_words = []
        #hits = searcher.search(query, 1)
        for hit in tqdm(docs_containing_term.scoreDocs):

            doc = searcher.doc(hit.doc)

            #get the text from each document
            doc_text = doc.get("text")
            # sa.single_doc returns the score data for one document and the
            # list of words that appear in the term windows for that document
            score_data, doc_words = sa.single_doc(
                TERM, doc_text, SA_dict, full_dict, top_replacements,
                window_size, spell_check_flag, example_flag, stem_flag, method)
            #print(score_data)
            term_words.append((doc.get(DOC_NAME).split('/')[-1], doc_words))
            sa_doc_score = [doc.get(DOC_NAME)] + score_data
            sa_term.append(sa_doc_score)
        sa_df = a_sa.make_sa_df(doc_data, sa_term, TERM)
        pickle.dump(sa_df, open('./pickles/%s_df.pkl' % TERM, 'wb'))
        pickle.dump(term_words, open('./pickles/%s_words.pkl' % TERM, 'wb'))
    else:
        sa_df = doc_data

    print(sa_df)

    #process dataframe for various properties (split this into specific functions later)
    use_weighted = True
    total_doc = False
    #	titles = index.get_documents(ids, ["id", "title"])
    #	print("\n".join(map(str, docs)))  # disabled: `docs` is never defined here
    sys.exit()

    # everything below this sys.exit() is unreachable experimental code,
    # exercising an Index/DocField wrapper that is not shown here
    fields = [
        DocField("id", stored=True, indexed=True),
        DocField("text", stored=True, indexed=True)
    ]
    index = Index(fields=fields)
    texts = [
        "just writing ", "what ever dude", "el dudino", "your dude", "the Dude"
    ]

    for i, text in enumerate(texts):
        index.add(id='doc_%d' % (i + 1), text=text)
    index.commit()

    ids, scores = index.search("dude+ever", ["text"], limit=10)
    print(index.get_documents(ids, "id"))

    # Try out some filters
    terms_filter = TermsFilter([Term("id", "doc_2")])
    ids, scores = index.search("dude+ever", ["text"], terms_filter, limit=10)
    print(index.get_documents(ids, "id"))

    fields = MultiFields.getMergedFieldInfos(index.reader).iterator()
    for f in fields:
        print(f.attributes())
    #	print(terms_filter.getDocIdSet(index.reader))
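
Example #2 calls define_search_params, whose body is not shown; a minimal sketch of what it might look like under PyLucene 4.x (the directory and query choices below are assumptions, not the original code):

from java.io import File
from org.apache.lucene.index import DirectoryReader, Term
from org.apache.lucene.search import IndexSearcher, TermQuery
from org.apache.lucene.store import SimpleFSDirectory

def define_search_params(store_dir, field, term):
    # open the on-disk index built by the indexing script
    store = SimpleFSDirectory(File(store_dir))
    reader = DirectoryReader.open(store)
    searcher = IndexSearcher(reader)
    # lowercase the term to match an analyzer that lowercases at index time
    query = TermQuery(Term(field, term.lower()))
    return searcher, reader, query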