예제 #1
0
 def __init__(self):
     """Connect to the database and build the inverted index in memory."""
     # Table the index rows will be written to.
     self.table_name = "InvertedIndex"
     # NOTE(review): credentials are hard-coded; consider loading from config.
     self.db = DBProcess('root', '19920102', 'Web2',
                         'InvertedIndex')  # connect to the database
     self.inverted_ob = InvertedIndex()
     self.inverted_index = self.inverted_ob.create_inverted_index(
     )  # build the inverted index
예제 #2
0
def main(create_index=False):
    """Load (or rebuild) the inverted index, then serve queries in a loop.

    When create_index is True the corpus is re-indexed from scratch;
    otherwise a previously saved index is loaded from disk, with a
    fallback prompt to build it if the file is missing.
    """
    import time
    import os
    from DocCollection import DocCollection
    from InvertedIndex import InvertedIndex

    doc_db = DocCollection()
    inv_index = InvertedIndex(doc_db)

    if create_index:
        index_corpus(inv_index)
    else:
        try:
            t_start = time.time()
            inv_index.load_index_from_file('./indexer/inverted_index.txt')
            t_end = time.time()
            print('\033[1;36;40m Load Index time: \033[0;0m', t_end - t_start)
        except FileNotFoundError:
            # No saved index on disk -- offer to build one now.
            print('Must create and save Inverted Index first!')
            answer = input(
                'Do you want to create Inverted Index now? y/n\n')
            if answer.lower() == 'y':
                print('Indexing Documents . . .')
                index_corpus(inv_index)
            else:
                exit(0)

    # Interactive query loop; runs until the process is terminated.
    while True:
        query_collection(inv_index, doc_db, get_docs=False)
예제 #3
0
def searchengine(directory):
    """Build an inverted index over every document under *directory*.

    Each document's text is tokenized, lower-cased, stopword-filtered and
    stemmed before being added to the index.

    Returns:
        (InvertedIndex, dict): the populated index and the corpus mapping
        document ids to Document objects.
    """
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    inverted_index = InvertedIndex()
    # Build the corpus from the files in the directory.
    corp = Corpus()
    corpus = corp.buildCorpus(directory)
    for doc_id in corpus:
        content = corpus[doc_id].getContent()
        for token in word_tokenize(content):
            token = token.lower()
            # BUG FIX: test stopwords *before* stemming.  The NLTK stopword
            # list holds unstemmed words ("this", "was"); the original
            # stemmed first ("thi", "wa"), so the check never matched and
            # stopwords leaked into the index.
            if token in stop_words:
                continue
            inverted_index.addTerm(stemmer.stem(token), doc_id)
    return inverted_index, corpus
예제 #4
0
    def process(url, soup):
        """Extract the visible text from *soup*, persist it as an Entity
        row, and feed the text to the inverted index."""

        # Helper from http://stackoverflow.com/questions/1936466/beautifulsoup-grab-visible-webpage-text
        def visible(element):
            hidden_parents = (
                'style', 'script', '[document]', 'head', 'title')
            if element.parent.name in hidden_parents:
                return False
            if re.match('<!--.*-->', str(element)):
                return False
            return True

        text = ' '.join(t for t in soup.findAll(text=True) if visible(t))
        e = models.Entity(
            # Hack to retrieve proper name from wikipedia pages
            name=soup.title.string.split(" - Wikipedia")[0],
            url=url,
            image_url=None,
            text=text,
            classification=None)

        # Write the object to the database.  Committing per object is
        # inefficient, but who cares.
        db.session.add(e)
        db.session.commit()

        # Keep the inverted index in sync with the newly stored page.
        InvertedIndex.build_inverted_index(text, url)
예제 #5
0
def main(args):
    """Wire up the document index, query manager and CLI, then start the CLI."""
    print("Initializing...")
    docs = DocumentIndex(args.file)
    inv_index = InvertedIndex()
    inv_index.initialize(docs.index)
    cli = CLI(docs, QueryManager(inv_index))
    print("Done")
    cli.initialize()
class BooleanRetrieval():
    """Console front-end for boolean retrieval over a positional index
    and an inverted index."""

    def __init__(self):
        self.ii = InvertedIndex()    # used for boolean queries (option 2)
        self.pi = PositionalIndex()  # used for positional queries (option 1)

    def userInterface(self):
        """Menu loop: keep prompting until the user enters '0'."""
        op = '1'
        while op != '0':
            print("Boolean Information Retrieval")
            print("-----------------------------")
            print("1. Positional Index")
            print("2. Inverted Index")
            print("0. Exit")

            op = input("Enter input: ")

            self.inputQuery(op)

    def inputQuery(self, op):
        """Dispatch one menu choice; anything other than '1'/'2' is a no-op."""
        if op == '1':
            query = input("Enter query: ")
            self.pi.loadDocuments()
            self.pi.buildDictionary()
            qb = QueryBuilder(query, positionalIndex=self.pi)
            print(qb.executeQuery())
            # Single-term queries additionally get a per-document dump.
            if len(qb.transformQuery()) == 1:
                self.printOutput(query)
            print("\n")
        elif op == '2':
            query = input("Enter boolean query: ")
            self.ii.loadDocuments()
            self.ii.buildDictionary()
            qb = QueryBuilder(query, invertedIndex=self.ii)
            print(qb.executeQuery())
            print("\n")

    def printOutput(self, query):
        """Print every document/positions entry for *query* plus a count."""
        print("query: ", query)
        dictionary = self.pi.getPositionalIndex(query)
        count = 0
        for doc, positions in dictionary.items():
            print("doc " + str(doc) + ":", positions)
            count += 1
        # BUG FIX: the original wrote "/n" instead of the newline escape "\n".
        print("\n  DOCUMENTS RETURNED\n" + ":", count)
예제 #7
0
def main():
    #All Crawling here!
    crawlSpider()
    invIndex = InvertedIndex()
    invIndex.loadPickles()

    queryObj = Query()

    os.system("clear")

    #Infinite loop
    while (1):

        print ""
        print ""
        print ""

        # invIndex.createTermFrequencyMatrix()
        queryObj.query = raw_input("Please enter a query for zackSpider: ")
        print "Your query is:", queryObj.query

        returnDocs = queryObj.parseQuery(queryObj.query,
                                         invIndex.inverted_index)
        if (returnDocs > 0):
            returnedDocs = sorted(returnDocs.items(),
                                  key=itemgetter(1),
                                  reverse=True)
            os.system("clear")
            print ""
            print ""
            print ""
            print "The following documents are ranked from highest to lowest similarity for your query: "
            print "---------------------------------------------------------------------------------------"

            print "{:<5} {:<15} {:<55} {:<10}".format('Doc', 'Similarity',
                                                      'Url', 'Preview')
            for key in returnedDocs:

                docKey = key[0] - 1
                doc = invIndex.collections_index[docKey]
                sim = key[1]
                print "{:<5} {:<15.10f} {:<55} {:<10}".format(
                    docKey, sim, doc[0], doc[1])

            print ""
            print ""

        else:
            print "No results."
            print ""
            print ""
예제 #8
0
class DBuildInvertedIndex(object):
    """Builds the inverted index and persists it into a MySQL table."""

    def __init__(self):
        self.table_name = "InvertedIndex"
        # NOTE(review): hard-coded credentials; load them from config instead.
        self.db = DBProcess('root', '19920102', 'Web2',
                            'InvertedIndex')  # connect to the database
        self.inverted_ob = InvertedIndex()
        self.inverted_index = self.inverted_ob.create_inverted_index(
        )  # build the inverted index

    def write_index_to_db(self):
        """Insert one row per term: (auto id, term, newline-joined URL list)."""
        count = 1
        for item in self.inverted_index:
            url_set = "\n".join(self.inverted_index[item])
            print("INSERT INTO " + self.table_name + " VALUE(NULL, \"" + item +
                  "\",\"" + url_set + "\")")
            try:
                # SECURITY: SQL built by string concatenation -- quotes in a
                # term or URL break the statement and permit injection.
                # Prefer a parameterized query if DBProcess supports one.
                self.db.insert_data("INSERT INTO " + self.table_name +
                                    " VALUE(NULL ,\"" + item + "\",\"" +
                                    url_set + "\")")
                print("insert success!  %d" % count)
                count += 1
            except pymysql.err.ProgrammingError as e:
                print("INSERT INTO " + self.table_name + " VALUE(NULL, \"" +
                      item + "\",\"" + url_set + "\")")
                print(e)
        self.db.close_db()
예제 #9
0
def main():
    #All Crawling here!
    crawlSpider();
    invIndex = InvertedIndex()
    invIndex.loadPickles()

    queryObj = Query()

    os.system("clear")

    #Infinite loop
    while(1):

        print""
        print""
        print""

        # invIndex.createTermFrequencyMatrix()
        queryObj.query = raw_input("Please enter a query for zackSpider: ")
        print "Your query is:", queryObj.query

        returnDocs = queryObj.parseQuery(queryObj.query, invIndex.inverted_index)
        if (returnDocs > 0):
            returnedDocs = sorted(returnDocs.items(),key=itemgetter(1), reverse=True)
            os.system("clear")            
            print""
            print""
            print""
            print "The following documents are ranked from highest to lowest similarity for your query: "
            print"---------------------------------------------------------------------------------------" 

            print "{:<5} {:<15} {:<55} {:<10}".format('Doc', 'Similarity', 'Url','Preview')
            for key in returnedDocs:

                docKey = key[0]-1
                doc = invIndex.collections_index[docKey]
                sim = key[1]
                print "{:<5} {:<15.10f} {:<55} {:<10}".format(docKey, sim, doc[0], doc[1])

            print""
            print""

        else:
            print "No results."
            print""
            print""
예제 #10
0
def result():
    """Render search results for the submitted form word.

    BUG FIX: the original referenced res / count_assignment /
    count_comparator even when the request was not a POST, raising
    NameError; non-POST requests now render an empty result page.
    """
    res, count_assignment, count_comparator = [], 0, 0
    if request.method == "POST":
        word = request.form.get("Search")
        res, count_assignment, count_comparator = InvertedIndex(word)
    return render_template("result.html",
                           result=res,
                           count_ass=count_assignment,
                           count_com=count_comparator)
예제 #11
0
def get_results():
    """Return the ranked URLs for the request's query string as JSON."""
    query = request.args.get('query')
    ranked = InvertedIndex.get_rank(query)
    payload = [{'url': url, 'rank': rank} for (url, rank) in ranked]
    return jsonify({'results': payload, 'time_ms': 30})
예제 #12
0
def main(docs_path: str, query_path: str):
    """Index every file in *docs_path*, answer the queries in *query_path*,
    and dump simple collection statistics.

    Writes boolean-retrieval results to Task_2.txt and the document-
    frequency extremes to Task_3.txt.
    """
    inv_idx = InvertedIndex()
    print(f"Indexing {docs_path}...")
    # Close the scandir iterator deterministically (it holds an OS handle).
    with os.scandir(docs_path) as docs_files:
        for file in docs_files:
            try:
                inv_idx.parse_file(file.path)
            except Exception as e:
                print(f"Error at {file.name}:\n\t{e}")
                # BUG FIX: bare "raise" keeps the original traceback;
                # "raise e" re-raised from here and obscured the source.
                raise

    inv_idx.finalize()
    print("Inverted Index is built.")
    print("Fetching queries...")
    with open(query_path, 'r') as f, open("Task_2.txt", 'w') as o:
        o.write('\n'.join([BooleanRetrieval(inv_idx, q) for q in f]))
    print("Done querying.")
    print("Collection statistics...")
    with open("Task_3.txt", 'w') as f:
        # NOTE(review): assumes inv_idx.index iterates terms in descending
        # document-frequency order -- confirm in InvertedIndex.finalize().
        terms = list(inv_idx.index.keys())

        f.write(f"Terms with highest document frequency:\n{terms[:10]}\n")
        f.write(f"Terms with lowest document frequency:\n{terms[-10:]}")
    print("Done.")
예제 #13
0
class SearchEngine(object):
    """Trie-backed search over a set of crawled HTML files."""

    # NOTE(review): this is a *class* attribute, so all SearchEngine
    # instances share one index; move it to __init__ if per-instance
    # indexes are ever needed.
    inverted_index = InvertedIndex()

    def buildTries(self):
        """Tokenize every mapped HTML file and load its terms into the index."""
        # BUG FIX: use context managers -- the original rebound one handle
        # per file and only closed the last one, leaking every other handle
        # (including the MAP_PATH one).
        with open(MAP_PATH, "r") as f:
            file_link_map = json.loads(f.readlines()[0])

        # Filter out stopwords and punctuation.
        stop_words = set(stopwords.words('english'))
        stop_words.update(string.punctuation)

        for filename in file_link_map:
            with open(filename) as page:
                soup = BeautifulSoup(page.read(), 'html.parser')
            for script in soup.findAll('script'):
                script.extract()
            for style in soup.findAll('style'):
                style.extract()
            words = word_tokenize(soup.get_text())
            # Drop any token containing punctuation characters.
            words = [w for w in words
                     if all(c not in string.punctuation for c in w)]

            for word in words:
                if (word.lower() not in stop_words
                        and len(word) > MINIMUM_CHR
                        and not word.isdigit()):
                    try:
                        # Skip words that cannot be encoded as ASCII.
                        word = word.lower().strip().encode('ascii')
                    except UnicodeEncodeError:
                        continue
                    self.inverted_index.put(word, file_link_map[filename])

    def search(self, key):
        """Return the postings for *key*, or an empty dict if unknown."""
        postings = self.inverted_index.get(key)
        return {} if postings is not None else {}

    def getRecomendKey(self, string):
        # NOTE(review): the parameter shadows the stdlib "string" module;
        # kept as-is because the name is part of the public signature.
        return self.inverted_index.getRecommendKey(string)
예제 #14
0
class DBuildInvertedIndex(object):
    """Creates the inverted index and writes it into a MySQL table."""

    def __init__(self):
        self.table_name = "InvertedIndex"
        # NOTE(review): credentials are hard-coded; move them to configuration.
        self.db = DBProcess('root', '19920102', 'Web2', 'InvertedIndex')   # connect to the database
        self.inverted_ob = InvertedIndex()
        self.inverted_index = self.inverted_ob.create_inverted_index()     # build the inverted index

    def write_index_to_db(self):
        """Insert one row per term: (auto id, term, newline-joined URL list)."""
        count = 1
        for item in self.inverted_index:
            url_set = "\n".join(self.inverted_index[item])
            print("INSERT INTO " + self.table_name + " VALUE(NULL, \"" + item + "\",\"" + url_set + "\")")
            try:
                # SECURITY: SQL built by string concatenation -- quotes in a
                # term or URL break the statement and permit injection;
                # prefer a parameterized API if DBProcess offers one.
                self.db.insert_data("INSERT INTO "+self.table_name+" VALUE(NULL ,\""+item+"\",\""+url_set+"\")")
                print("insert success!  %d" % count)
                count += 1
            except pymysql.err.ProgrammingError as e:
                print("INSERT INTO " + self.table_name + " VALUE(NULL, \"" + item + "\",\"" + url_set + "\")")
                print(e)
        self.db.close_db()
예제 #15
0
def main():
    """Look up one term (-t) in the cran-1400 index and print its postings."""
    parser = argparse.ArgumentParser(
        description='An example script to check th index')
    parser.add_argument("-t",
                        type=str,
                        required=True,
                        help="The term to extract")
    args = parser.parse_args()
    index = InvertedIndex()
    documents = read_file("cran-1400.txt")
    print("Initializing index...")
    index.initialize(documents)
    print("Lets check the number of indexed terms:")
    print(str(len(index.terms)))
    print("Checking how many documents contain the word " + args.t)
    try:
        # Hoisted: the original called get_post_list twice for the same term.
        post_list = index.get_post_list(args.t)
        print(len(post_list))
        print("The exact documents are:")
        for doc_id in post_list.keys():
            print(doc_id)
    except Exception:
        # BUG FIX: a bare "except:" also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception.
        print("No entry for that term")
        exit(1)
 def __init__(self):
     """Create the two index objects this retrieval front-end dispatches to."""
     self.ii = InvertedIndex()    # inverted index
     self.pi = PositionalIndex()  # positional index
예제 #17
0
    def load_inverted_index_in_memory(self, collection_stats_file,
                                      docs_meta_file, lookup_table_file,
                                      inverted_lists_file, compressed):
        """
        Loads an inverted index in memory; the inverted lists themselves are
        loaded only when self.config.in_memory is set.
        buffer collection_stats_file: Buffer for the collection stats file
        buffer docs_meta_file: Buffer for the docs meta file
        buffer lookup_table_file: Buffer for the lookup table file
        buffer inverted_lists_file: Buffer for the inverted lists file
        bool compressed: Whether the on-disk posting lists are compressed
        Returns the populated InvertedIndex.
        """
        inverted_index = InvertedIndex(self.config, compressed)

        # Load collection statistics
        collection_stats = json.load(collection_stats_file)
        inverted_index.load_collection_stats(collection_stats)

        # Load meta info for documents
        docs_meta = json.load(docs_meta_file)
        inverted_index.load_docs_meta(docs_meta)

        # Load lookup table (term -> posting-list position/size/df stats)
        lookup_table = json.load(lookup_table_file)
        inverted_index.load_lookup_table(lookup_table)

        # Load vocabulary
        inverted_index.load_vocabulary()

        # Load inverted lists only if in_memory is True
        if self.config.in_memory:
            index_map = defaultdict(InvertedList)
            for term, term_stats in lookup_table.items():
                # defaultdict creates an empty InvertedList on first access;
                # the binary postings are then decoded into it in place.
                inverted_list = index_map[term]
                inverted_list_binary = inverted_index.read_inverted_list_from_file(
                    inverted_lists_file, term_stats['posting_list_position'],
                    term_stats['posting_list_size'])
                inverted_list.bytearray_to_postings(inverted_list_binary,
                                                    compressed,
                                                    term_stats['df'])
            inverted_index.load_map(index_map)

        return inverted_index
예제 #18
0
# -*- coding:utf-8 -*-
# Builds an inverted index over the chapter files and prints term
# frequencies (Python 2 script).

__author__ = 'Kusamura'

from Data import Data
from InvertedIndex import InvertedIndex
from Search import Search
from Frequency import Frequency

if __name__ == '__main__':
    fileList = []  # list of chapter names
    for line in open('chaps/chap_title.tsv', 'r'):  # build the chapter list from chap_title
        fileList.append(line[:-1].split('\t')[0][2:])
    dataList = []  # list of Data objects (data read from the files)
    for fileName in fileList:
        dataList.append(Data(fileName))

    index = InvertedIndex(dataList)  # build the inverted index

    #	module = Search(index) # search module
    #	keys = ['retrieval', 'half-a-trillion', 'thus', 'layer', 'test', 'hoge', 'hogehoge']
    #	for key in keys:
    #		module.do(key)

    #	print index.countKeys() # count the number of terms in the dictionary

    #	print Frequency().docFrequency(index) # document frequency

    print Frequency().termFrequency(dataList)  # term frequency
예제 #19
0
 def create_inverted_index(self, compressed):
     """
     Builds and returns an inverted index over the loaded corpus.
     bool compressed: Flag to choose between a compressed / uncompressed index
     """
     index = InvertedIndex(self.config, compressed)
     data = self.load_data()
     # Document ids are assigned sequentially from 0 in corpus order.
     for doc_id, scene in enumerate(data['corpus']):
         # Drop the empty strings that splitting on whitespace can leave.
         terms = [t for t in scene['text'].split() if t]
         doc_meta = {
             'playId': scene['playId'],
             'sceneId': scene['sceneId'],
             'sceneNum': scene['sceneNum'],
             'sceneLength': len(terms),
         }
         index.update_docs_meta(doc_id, doc_meta)
         index.update_collection_stats(doc_length=doc_meta['sceneLength'])
         for position, term in enumerate(terms):
             index.update_map(term, doc_id, position)
     index.update_collection_stats(average_length=True)
     index.load_vocabulary()
     return index
예제 #20
0
from bs4 import BeautifulSoup
import requests
from Trie import CompressedTrie
from InvertedIndex import InvertedIndex
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import unicodedata

# Module-level shared state for the crawler.
# BUG FIX: the original declared "global cache/inv_idx/unused_urls" at
# module scope, where the global statement is a no-op; dropped.
unused_urls = []
cache = CompressedTrie()
inv_idx = InvertedIndex()


def page_load(url):
    """Fetch the raw HTML body of *url* via an HTTP GET.

    Parameters: string url - webpage url
    Return:     bytes     - webpage HTML data
    """
    response = requests.get(url)
    return response.content


def strip_accents(text):
    """
    Replace accented letters in word with equivalent for processing in trie
    Parameters: string text - text to be processed
예제 #21
0
# BUG FIX: os and sys were used below but never imported.
import os
import sys
import time

from bs4 import BeautifulSoup
from InvertedIndex import InvertedIndex

if __name__ == "__main__":
    # Default folder path and index filename; overridden when exactly two
    # CLI arguments are supplied.
    path = "/home/stark/Projects/vector-space-retrieval-system/data/sample/"
    # NOTE(review): index_filename is parsed but never passed to idx.save()
    # below -- confirm InvertedIndex.save()'s default target.
    index_filename = "indexfile"

    if len(sys.argv) == 3:
        path = sys.argv[1]
        index_filename = sys.argv[2]
    start_time = time.time()
    idx = InvertedIndex()
    for filename in os.listdir(path):
        print(filename)
        # Close each input file deterministically (the original leaked the
        # handle until interpreter exit).
        with open(path + filename) as file:
            content = "".join(file.readlines())
        bs_content = BeautifulSoup(content, "html.parser")
        for doc in bs_content.find_all("doc"):
            idx.add_doc(doc)

    idx.assign_postings_list()
    idx.save()
    end_time = time.time()
    print("Total time: {}".format(end_time - start_time))
    del idx
예제 #22
0
 def __init__(self, queryString=""):
     """Set up the query object with its index and TF-IDF helpers
     and store the initial query string (Python 2)."""
     print "Constructing Query Object!"
     self.invIndex = InvertedIndex()
     self.tfidf = TFIDF()
     self.query = queryString
예제 #23
0
                doc_id[0],
                rank,
                doc_id[1],
                "STANDARD",
            ))
            rank += 1
    file.close()


if __name__ == "__main__":
    # Command-line configuration for one ranking run.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--cutoff", type=int, default=10)
    arg_parser.add_argument("--query", type=str, default="data/topics.51-100")
    arg_parser.add_argument("--index", type=str, default="./indexfile.idx")
    arg_parser.add_argument("--dict", type=str, default="./indexfile.dict")
    arg_parser.add_argument("--output", type=str, default="resultfile")
    args = arg_parser.parse_args()

    invidx = InvertedIndex(dictionary_filepath=args.dict,
                           postings_list_filepath=args.index)
    queryfile = args.query
    queryList, queryIdMap = get_query_list(queryfile)

    # Keep only the top --cutoff documents for every query.
    ranklist = {q: invidx.get_ranking_list(q)[:args.cutoff]
                for q in queryList}

    write_resultfile(args.output, ranklist, queryIdMap)
    del invidx
    del ranklist
예제 #24
0
 def __init__(self):
     """Connect to MySQL and build the in-memory inverted index."""
     self.table_name = "InvertedIndex"
     # NOTE(review): hard-coded DB credentials; move them to configuration.
     self.db = DBProcess('root', '19920102', 'Web2', 'InvertedIndex')   # connect to the database
     self.inverted_ob = InvertedIndex()
     self.inverted_index = self.inverted_ob.create_inverted_index()     # build the inverted index