Code example #1
def endElement(self, name):
    if name == "page":
        self.current_article.processArticle()  #### This is where the processing for the article starts
        Indexer.doIndex(self.current_article, self.outfile)

    to_store = ""
    if len(self.current_lines):
        try:
            to_store = (''.join(self.current_lines)).strip()
        except Exception as e:
            print "exception occurred at id={0} and title={1}\n {2}".format(self.current_article.id, self.current_article.title, e)
    if name == "title":
        self.current_article.title = to_store
    if name == "id":
        if self.parent_element == "page":
            self.current_article.id = to_store
    if name == "text":
        self.current_article.text = to_store
    self.elements.pop()
    if self.elements:
        self.current_element = self.parent_element
        if len(self.elements) == 1:
            self.parent_element = ""
        else:
            self.parent_element = self.elements[-1]
    else:
        self.current_element = ""
Code example #2
    def endElement(self, name):
        if name == "page":
            self.current_article.processArticle()  #### This is where the processing for the article starts
            Indexer.doIndex(self.current_article, self.outfile)

        to_store = ""
        if len(self.current_lines):
            try:
                to_store = (''.join(self.current_lines)).strip()
            except Exception as e:
                print "exception occurred at id={0} and title={1}\n {2}".format(
                    self.current_article.id, self.current_article.title, e)
        #to_store = self.current_characters.strip()
        if name == "title":
            self.current_article.title = to_store
        if name == "id":
            if self.parent_element == "page":
                self.current_article.id = to_store
        if name == "text":
            self.current_article.text = to_store
        self.elements.pop()
        if self.elements:
            self.current_element = self.parent_element
            if len(self.elements) == 1:
                self.parent_element = ""
            else:
                self.parent_element = self.elements[len(self.elements) - 1]
        else:
            self.current_element = ""
Code example #3
def main():

    print("--- RETRIEVER ---")
    print("Select")
    print("1 for BM25")
    print("2 for tf-idf")
    print("3 for Query Likelihood Model")
    user_choice = input("Enter your choice: ")
    if user_choice not in ["1", "2", "3"]:
        print("\nInvalid input. Aborting . .")
        sys.exit()

    
    # sets "RETRIEVAL_MODEL" to the user chosen model.
    set_retrieval_model(user_choice)
    
    # Create a directory to save the results
    # and remove any existing run file.
    os.makedirs(RUN_OUTPUTS_DIR,exist_ok=True)
    global RETRIEVAL_MODEL
    output_file = os.path.join(RUN_OUTPUTS_DIR,RETRIEVAL_MODEL+"Run.txt")
    if os.path.exists(output_file):
        os.remove(output_file)
     
    # Generate the unigram index.
    # By default, not performing stopping.
    # So send False
    Indexer.unigram_index(False)    

    # Fetch the index generated.
    global INVERTED_INDEX
    INVERTED_INDEX = Indexer.INVERTED_INDEX
    global DOC_TOKEN_COUNT
    DOC_TOKEN_COUNT = Indexer.DOC_TOKEN_COUNT
      

    # Read all queries. 
    queries = extract_queries_from_file()   
    
    global QUERY_ID
    for query in queries:
        QUERY_ID +=1       
        
        # Dictionary of query term frequency
        query_term_freq = query_term_freq_map(query)
        
        # Fetch the inverted indexes corresponding to the terms
        # in the query.
        fetched_index = query_matching_index(query_term_freq)
                
        # Compute ranking scores of all docs for this query.
        doc_scores = compute_doc_scores(fetched_index, query_term_freq)
        
        # Write results to a textfile.
        output_to_file(doc_scores,QUERY_ID)        

        print("Completed Retrieval for query : " + query)

    print("End of Retrieval.")
Code example #4
File: PoolTable.py, Project: sgrieve/PoolTable
def Get_UL_Coord(contour,pad=10):
    """
    Get the upper left coordinate of the contour.
    """
    dists = []
    for c in contour:
        dists.append(distbetween(c[0][0],c[0][1],0,0))

    return (contour[Indexer.get_index_of_min(dists)[0]][0][0]-pad,contour[Indexer.get_index_of_min(dists)[0]][0][1]-pad)
Code example #5
File: Controller.py, Project: anaelgor/SearchEngine
def data_set_Path(corpus_path, index_path):
    global __stopwords_path
    global __corpus_path
    global __index_path
    __stopwords_path = corpus_path + "\\stop_words.txt"
    Parser.set_stop_words_file(__stopwords_path)
    __corpus_path = corpus_path
    __index_path = index_path
    Indexer.set_path_to_postiong_files(__index_path)
Code example #6
File: PoolTable.py, Project: sgrieve/PoolTable
def Get_LR_Coord(contour,imgXmax,imgYmax, pad=10):    
    """
    Get the lower right coordinate of the contour.
    """
    dists = []
    for c in contour:
        dists.append(distbetween(c[0][0],c[0][1],imgXmax,imgYmax))

    return (contour[Indexer.get_index_of_min(dists)[0]][0][0]+pad,contour[Indexer.get_index_of_min(dists)[0]][0][1]+pad)
Code example #7
File: PoolTable.py, Project: sealghost/PoolTable
def Get_UL_Coord(contour, pad=10):
    """
    Get the upper left coordinate of the contour.
    """
    dists = []
    for c in contour:
        dists.append(distbetween(c[0][0], c[0][1], 0, 0))

    return (contour[Indexer.get_index_of_min(dists)[0]][0][0] - pad,
            contour[Indexer.get_index_of_min(dists)[0]][0][1] - pad)
Code example #8
File: PoolTable.py, Project: sealghost/PoolTable
def Get_LR_Coord(contour, imgXmax, imgYmax, pad=10):
    """
    Get the lower right coordinate of the contour.
    """
    dists = []
    for c in contour:
        dists.append(distbetween(c[0][0], c[0][1], imgXmax, imgYmax))

    return (contour[Indexer.get_index_of_min(dists)[0]][0][0] + pad,
            contour[Indexer.get_index_of_min(dists)[0]][0][1] + pad)
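The four coordinate helpers above call two functions that are not shown in these excerpts: distbetween, presumably a plain Euclidean distance, and Indexer.get_index_of_min, which is indexed with [0] and so apparently returns a sequence of positions. The following is only a sketch of what the Indexer module's helpers might look like, offered as an assumption about their behaviour; the related get_index_of_max used further down is sketched the same way.

import math

import numpy as np


def distbetween(x1, y1, x2, y2):
    # Euclidean distance between two points (assumed behaviour).
    return math.hypot(x2 - x1, y2 - y1)


def get_index_of_min(values):
    # Positions of the smallest value; callers above take element [0],
    # so returning an index array keeps the same calling convention.
    values = np.asarray(values)
    return np.where(values == values.min())[0]


def get_index_of_max(values):
    # Same idea for the largest value, as used by GetClothColor and MaskTableBed below.
    values = np.asarray(values)
    return np.where(values == values.max())[0]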
Code example #9
def main():

    print("--- RETRIEVAL WITH SNIPPETS ---")
   
    # Generate the unigram index.
    # By default, not performing stopping.
    # So send False
    Indexer.unigram_index(False)    

    # Fetch the index generated.
    global INVERTED_INDEX
    INVERTED_INDEX = Indexer.INVERTED_INDEX
    global DOC_TOKEN_COUNT
    DOC_TOKEN_COUNT = Indexer.DOC_TOKEN_COUNT
       

    # Read all queries. 
    queries = extract_queries_from_file()   
    

    global QUERY_ID
    for query in queries:
        QUERY_ID +=1       
        
        # Dictionary of query term frequency
        query_term_freq = query_term_freq_map(query)
        
        # Fetch the inverted indexes corresponding to the terms
        # in the query.
        fetched_index = query_matching_index(query_term_freq)

        # Compute BM25 score for this query.
        doc_scores = BM25_score(fetched_index, query_term_freq)
        
        # Extract top 100 docs, for which we need
        # to generate snippets.
        doc_list = []
        sorted_scores = [(k, doc_scores[k]) for k in sorted(doc_scores, key=doc_scores.get, reverse=True)]
        for i in range(min(len(sorted_scores), 100)):
            k, v = sorted_scores[i]
            doc_list.append(k)
        
        
        ''' top100 = Counter(doc_scores).most_common(100)
        doc_list = []
        for k,v in top100: 
            doc_list.append(k) '''
        
        # Snippet generator prints snippets 
        # with query term highlighting, onto the console.
        SnippetGenerator.snippet_generator(doc_list,query, INVERTED_INDEX)
               

        print("Completed Retrieval for query : " + query)

    print("End of Retrieval with Snippets.")
Code example #10
def index_retri():
    Indexer.parse_corpus()
    Indexer.build()
    number_of_terms1 = Indexer.number_of_terms1
    index_dict = Indexer.index_dict
    parsed_queries = Retrieval.read_query_doc()

    query_id = 1
    for query in parsed_queries:
        Retrieval.tf_idf(index_dict, number_of_terms1, query, query_id)
        query_id += 1
Code example #11
File: Controller.py, Project: anaelgor/SearchEngine
def reset(param=None):
    global __corpus_path,__index_path, __stem_suffix
    if param == "Queries":
        Parser.reset()
        __corpus_path = ""
    else:
        ReadFile.reset()
        Parser.reset()
        Indexer.reset()
        remove_index_files()
        __stem_suffix = ''
        __corpus_path = ""
        __index_path = ""
Code example #12
def task3a(model, raw_corpus_directory):
    project_directory = os.getcwd()
    p = Indexer.Parser()
    corpus_directory = p.build_corpus(raw_corpus_directory, stopped=True)
    output_directory = os.path.join(project_directory, "output")

    I = Indexer.InvertedIndexer(corpus_directory)
    I.ngram_indexer(1)  # builds a unigram indexes for each word
    r = Retriever.Retriever(
        corpus_directory, I, project_directory
    )  # create a Retriever class, which contains different retrieval model

    os.chdir(raw_corpus_directory)
    os.chdir(os.pardir)
    f = open('cacm.query.txt', 'r')
    soup = BeautifulSoup(f.read(), 'html.parser')
    f.close()

    f_stop_words = open('common_words.txt', 'r')
    stop_words_list = f_stop_words.readlines()
    stop_words = [i.strip() for i in stop_words_list]
    f_stop_words.close()
    file_name = os.path.join(output_directory, 'task3a_' + model + '.txt')
    f = open(file_name, 'w')  # open file for writing results
    for i in range(64):
        query_no = (soup.find('docno')).text.encode(
            'utf-8')  # extract query number and query
        (soup.find('docno')).decompose()
        query = (soup.find('doc')).text.encode('utf-8')
        (soup.find('doc')).decompose()

        r.process_query(query, stopped=True,
                        stopwords=stop_words)  # parse the query
        # r.clean_content(query)
        docs_and_scores = r.get_scores_for_docs(
            model, int(query_no))  # retrieve relevant documents

        # save results into appropriate file
        docs = docs_and_scores[0]
        scores = docs_and_scores[1]
        for i in range(100):
            f.write(str(query_no) \
                        + " Q0 " \
                        + str(docs[i]) + ' ' \
                        + str((i+1)) + " " \
                        + str(scores[i]) + " " \
                        + model + "\n")
    f.close()
Code example #13
def main():

    print("--- PSEUDO RELEVANCE RETRIEVER ---")

    # Create a directory to save the results
    # and remove any existing run file.
    os.makedirs(RUN_OUTPUTS_DIR,exist_ok=True)    
    output_file = os.path.join(RUN_OUTPUTS_DIR,"PseudoRelBM25Run.txt")
    if os.path.exists(output_file):
        os.remove(output_file)

    # Generate the unigram index.
    # By default, not performing stopping.
    # So send False
    Indexer.unigram_index(False)    

    # Fetch the index generated.
    global INVERTED_INDEX
    INVERTED_INDEX = Indexer.INVERTED_INDEX
    global DOC_TOKEN_COUNT
    DOC_TOKEN_COUNT = Indexer.DOC_TOKEN_COUNT

    # Read all queries. 
    queries = extract_queries_from_file()   

    global QUERY_ID
    for query in queries:
        QUERY_ID +=1       
        
        # Dictionary of query term frequency
        query_term_freq = query_term_freq_map(query)
        
        # Fetch the inverted indexes corresponding to the terms
        # in the query.
        fetched_index = query_matching_index(query_term_freq)
                
        # Compute BM25 scores for this query.
        doc_scores = BM25_score(fetched_index, query_term_freq)

        # Update doc scores using Pseudo-relevance
        doc_scores = pseudo_relevance(query,doc_scores)
        
        # Write results to a textfile.
        output_to_file(doc_scores,QUERY_ID)        

        print("Completed Retrieval for query : " + query)

    print("End of Pseudo Relevance retrieval.")
Code example #14
    def findSimilar(self,link,limit):

        # we call the read text function from the Crawler to read the new link
        # we use the constructor with empty variables
        crawler = Crawler.Crawler('',0,0,0)


        self.limit = limit
        file = open("Data/page%d.txt" % self.limit, 'w')

        try:
            self.title , text = crawler.getText(link)
            # we combine the lists of string to a single string
            text = ''.join(text)
            for t in text:
                file.write(t)
            file.close()
        except:
            "Link is not accesible"
            file.close()
            sys.exit(0)

        indexer = Indexer.Indexer()
        indexer.start()

        cosineSimilarity = indexer.getCosineSimilarity()



        linksId = [ i for i in range(self.limit)]

        linksIdSorted = [x for _,x in sorted(zip(cosineSimilarity,linksId),reverse=True)]

        return cosineSimilarity , linksIdSorted
Code example #15
File: Controller.py, Project: xiamaz/pyIndexer
 def __init__(self):
     # initialize the Controller object, first we would need to initialize
     # the configuration system
     # the configuration is saved in the appdata system
     self.configuration = Configuration.Configuration()
     self.crawler = Indexer.FileCrawler()
     self.sep = os.sep
Code example #16
def task3b(model):
    output_directory = os.path.join(os.getcwd(), "output")
    stemmed_corpus = get_stemmed_corpus()
    f = open('cacm_stem.query.txt', 'r')
    stemmed_queries = f.readlines()
    f.close()

    I = Indexer.InvertedIndexer('')
    I.stemmed_indexer(stemmed_corpus)
    r = Retriever.Retriever('', I, os.getcwd())
    file_name = os.path.join(output_directory,
                             'task3b_' + model + '_stemmed.txt')
    f = open(file_name, 'w')
    query_no = [12, 13, 19, 23, 24, 25, 50]
    q_iter = 0
    for each_query in stemmed_queries:
        r.process_query(each_query)
        docs_and_scores = r.get_scores_for_docs(model, query_no[q_iter])

        # save results into appropriate file
        docs = docs_and_scores[0]
        scores = docs_and_scores[1]
        for i in range(100):
            f.write(str(query_no[q_iter]) \
                    + " Q0 " \
                    + str(docs[i]) + ' ' \
                    + str((i + 1)) + " " \
                    + str(scores[i]) + " " \
                    + model + "\n")
        q_iter += 1
    f.close()
Code example #17
File: nSyn.py, Project: neo1989/nSyn
def localIndexer():
    d = eval('local("tree -Nif %s",True)' % LOCAL_DIR)
    filelist = d.split('\n')
    for path in filelist[1:-2]:
        path = _strEncode(path)
        index = path.replace(r'%s' % LOCAL_DIR,'')
        r = eval('local("test -d %s || md5sum %s",True)' % (path,path))
        if not r:
            md5sum = 0
            lastModification = 0
            isDir = 1
        else:
            md5sum = r.split('  ')[0] 
            lastModification = int(eval('local("stat -c %Y '+ path +'",True)'))
            isDir = 0

        Indexer.addLocal(index,md5sum,lastModification,isDir)
Code example #18
File: nSyn.py, Project: neo1989/nSyn
def remoteIndexer():
    d = eval('run("tree -Nif %s")' % REMOTE_DIR)
    filelist = d.split('\r\n')
    for path in filelist[1:-2]:
        path = _strEncode(path)
        index = path.replace(r'%s' % REMOTE_DIR,'') 
        r = eval('run("test -d %s || md5sum %s")' % (path,path))
        if not r:
            md5sum = 0
            lastModification = 0
            isDir = 1
        else:
            md5sum = r.split('  ')[0] 
            lastModification = int(eval('run("stat -c %Y '+ path +'")'))
            isDir = 0
        
        Indexer.addRemote(index,md5sum,lastModification,isDir)
Code example #19
def main():
    global stop_words
    global parsed_queries

    Indexer.read_stop_words()
    Retrieval.read_query_doc()
    # generate the unigrams from the lucene results for k=10 or 20
    stop_words = Indexer.stop_words
    parsed_queries = Retrieval.parsed_queries

    # evaluating the dice coefficient on the search query with n=8 or 6
    query_id = 0
    for query in parsed_queries:
        query_id = query_id + 1
        result = generateUnigramDocFreq(stop_words, query_id)
        runSearchQuery(query.lower(), result, 8)

    Retrieval.write_queries_to_file("expanded-queries", expanded_queries)
Code example #20
def read_query_doc():
    global parsed_queries
    fr = open('test-collection/cacm.query.txt', 'r')
    queries = fr.read()
    while queries.find('<DOC>') != -1:
        query = queries[queries.find('</DOCNO>') + 8:queries.find('</DOC>')]
        query = query.strip().replace('\n', " ")
        new_query = Indexer.transformText(query, True, True)
        parsed_queries.append(new_query)
        queries = queries[queries.find('</DOC>') + 6:]
    write_queries_to_file("parsed-queries", parsed_queries)
Code example #21
File: PoolTable.py, Project: sgrieve/PoolTable
def GetClothColor(hsv,search_width=45):
    """
    Find the most common HSV values in the image.
    In a well lit image, this will be the cloth
    """

    # hue is channel 0 in OpenCV's HSV representation
    hist = cv2.calcHist([hsv], [0], None, [180], [0, 180])
    h_max = Indexer.get_index_of_max(hist)[0]
    
    hist = cv2.calcHist([hsv], [1], None, [256], [0, 256])
    s_max = Indexer.get_index_of_max(hist)[0]
    
    hist = cv2.calcHist([hsv], [2], None, [256], [0, 256])
    v_max = Indexer.get_index_of_max(hist)[0]

    # define a colour range around the dominant cloth colour in HSV
    lower_color = np.array([h_max-search_width,s_max-search_width,v_max-search_width])
    upper_color = np.array([h_max+search_width,s_max+search_width,v_max+search_width])

    return lower_color, upper_color
Code example #22
File: main.py, Project: omrikiantman/ir_bgu
def run_index():
    # run an entire index build
    global docs_path
    global postings_path
    global is_stemming
    global indexer
    global dict_cache_path
    try:
        # check validation conditions
        if (not check_corpus_directory(docs_path.get())) or (not check_postings_directory(postings_path.get())):
            return
        result = tkMessageBox.askquestion("Run Index",
                                          "Are you sure?\n dont worry if the GUI"
                                          " is stuck or not responding - it is working", icon='warning')
        if result != 'yes':
            return
        print ('START TIME - ' + time.strftime("%H:%M:%S"))
        start_time = datetime.now()
        # reset the current memory of the project
        if (globs.main_dictionary is not None) and (bool(globs.main_dictionary)):
            globs.main_dictionary.clear()
        if (globs.cache is not None) and (bool(globs.cache)):
            globs.cache.clear()
        if (globs.documents_dict is not None) and (bool(globs.documents_dict)):
            globs.documents_dict.clear()
        # start indexing
        globs.stop_words = load_stop_words(docs_path.get())
        indexer = Indexer.Indexer(postings_path.get(), is_stemming.get())
        read_file = ReadFile.ReadFile(get_corpus_dir(docs_path.get()),
                                      indexer, globs.constants, globs.stop_words, is_stemming.get())
        read_file.index_folder()
        globs.num_of_documents = len(read_file.documents_dict)

        globs.documents_dict = read_file.documents_dict
        del read_file
        indexer.unite_temp_postings()
        globs.main_dictionary = indexer.main_dict
        indexer.build_document_weight(globs.documents_dict)
        # in case want to print stats, uncomment this
        # with open('{}{}'.format('stats', 'stem' if is_stemming.get() else ''),'w') as my_stats_file:
        #    my_stats_file.write('term,tf,df\n'.format())
        #    for key,val in main_dictionary.iteritems():
        #        my_stats_file.write('{},{},{}\n'.format(key,val.tf,val.df))
        globs.cache = indexer.cache_dict
        globs.average_doc_size = globs.average_doc_size/globs.num_of_documents
        dict_cache_path = postings_path
        print ('END TIME - ' + time.strftime("%H:%M:%S"))
        end_time = datetime.now()
        print_stats_at_end_of_indexing(end_time - start_time)
    except Exception as err:
        tkMessageBox.showinfo('ERROR', err)
        traceback.print_exc(file=stdout)
Code example #23
File: PoolTable.py, Project: sealghost/PoolTable
def MaskTableBed(contours):
    """
    Mask out the table bed, assuming that it will be the biggest contour.
    """

    #The largest area should be the table bed
    areas = []
    for c in contours:
        areas.append(cv2.contourArea(c))

    #return the contour that delineates the table bed
    largest_contour = Indexer.get_index_of_max(areas)
    return contours[largest_contour[0]]
Code example #24
File: PoolTable.py, Project: sgrieve/PoolTable
def MaskTableBed(contours):
    """
    Mask out the table bed, assuming that it will be the biggest contour.
    """
            
    #The largest area should be the table bed    
    areas = []    
    for c in contours:
        areas.append(cv2.contourArea(c))
    
    #return the contour that delineates the table bed
    largest_contour = Indexer.get_index_of_max(areas)
    return contours[largest_contour[0]]
Code example #25
def change_index(request):
    ans = URL.objects.all()
    cur_page = []
    for i in ans:
        cur_page.append((str(i.link), str(i.description)))
    if request.GET.get('request') == 'page':
        num_page = int(request.GET.get('num'))
        return HttpResponse(json.dumps({'urls': cur_page[5 * (num_page - 1): num_page * 5]}), content_type="application/json")
    elif request.GET.get('request') == 'save':
        name = base64.b16encode(request.GET.get('url'))
        f = open('/home/katrin/databasetemp/' + name, "w")
        f.write(request.GET.get('txt'))
        f.close()

        try:
            d.start('/home/katrin/databasetemp/')
        except Exception as e:
            print e
        URL.objects.filter(link=request.GET.get('url')).delete()
        new_link = URL(link=request.GET.get('url'), description=request.GET.get('txt'))
        new_link.save()
        return HttpResponse(json.dumps('ok'), content_type="application/json")
Code example #26
File: PoolTable.py, Project: sealghost/PoolTable
def GetClothColor(hsv, search_width=45):
    """
    Find the most common HSV values in the image.
    In a well lit image, this will be the cloth
    """

    # hue is channel 0 in OpenCV's HSV representation
    hist = cv2.calcHist([hsv], [0], None, [180], [0, 180])
    h_max = Indexer.get_index_of_max(hist)[0]

    hist = cv2.calcHist([hsv], [1], None, [256], [0, 256])
    s_max = Indexer.get_index_of_max(hist)[0]

    hist = cv2.calcHist([hsv], [2], None, [256], [0, 256])
    v_max = Indexer.get_index_of_max(hist)[0]

    # define a colour range around the dominant cloth colour in HSV
    lower_color = np.array(
        [h_max - search_width, s_max - search_width, v_max - search_width])
    upper_color = np.array(
        [h_max + search_width, s_max + search_width, v_max + search_width])

    return lower_color, upper_color
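The lower and upper bounds returned by GetClothColor are in the form cv2.inRange expects, so a typical next step is to threshold the HSV image with them and pass the resulting mask to cv2.findContours, whose output then feeds MaskTableBed below. A short usage sketch under that assumption; mask_cloth is a hypothetical helper name, not part of the project above.

import cv2


def mask_cloth(bgr_image):
    # Estimate the cloth colour range in HSV and build a binary mask from it.
    hsv = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2HSV)
    lower_color, upper_color = GetClothColor(hsv)
    mask = cv2.inRange(hsv, lower_color, upper_color)
    # OpenCV 4.x returns (contours, hierarchy); 3.x prepends the image.
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    return mask, contours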
Code example #27
def checkInIndexFileWordMap(term):
    pos = bisect.bisect(sortedIndexFileWordMapKeys,term)
    if pos > 0:
        pos = pos - 1
    key = sortedIndexFileWordMapKeys[pos]
    index = indexFileWordMap[key]
    with bz2.BZ2File("{0}.index{1}.bz2".format(infile,index), 'rb', compresslevel=9) as ipartF:
        for line in ipartF:
            if line.startswith("{0}=".format(term)):
                parts = line.strip().split("=")
                if len(parts) == 2:
                    ffo = Indexer.getFOFromLine(parts[1])
                    return ffo
    return {}
Code example #28
File: nSyn.py, Project: neo1989/nSyn
def parallel():
    remote_indexes = Indexer.allRemote()
    for r in remote_indexes:
        l = Indexer.oneLocal(r.index)
        if not l:
            if r.isDir == 1:
                local_dir_to_create.append(r.index)
            else:
                files_from_remote.append(r.index)
        else:
            if r.isDir == 0 and r.md5sum != l.md5sum and r.lastModification > l.lastModification:
                files_from_remote.append(r.index)

    local_indexes = Indexer.allLocal()
    for l in local_indexes:
        r = Indexer.oneRemote(l.index)
        if not r:
            if l.isDir == 1:
                remote_dir_to_create.append(l.index)
            else:
                files_from_local.append(l.index)
        else:
            if l.isDir == 0 and l.md5sum != r.md5sum and l.lastModification > r.lastModification:
                files_from_local.append(l.index)
Code example #29
def checkInIndexFileWordMap(term):
    pos = bisect.bisect(sortedIndexFileWordMapKeys,term)
    if pos > 0:
        pos = pos - 1
    key = sortedIndexFileWordMapKeys[pos]
    index = indexFileWordMap[key]
    #print "key = {0} and index = {1}".format(key,index)
    with bz2.BZ2File("{0}.index{1}.bz2".format(infile,index), 'rb', compresslevel=9) as ipartF:
        #print "checking file {0}.index{1}.bz2".format(infile,index)
        for line in ipartF:
            if line.startswith("{0}=".format(term)):
                parts = line.strip().split("=")
                if len(parts) == 2:
                    #word = parts[0]
                    ffo = Indexer.getFOFromLine(parts[1])
                    return ffo
    return {}
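Both checkInIndexFileWordMap variants assume three globals prepared elsewhere: infile (the base name of the index), indexFileWordMap, mapping the first term stored in each compressed index part to that part's number, and sortedIndexFileWordMapKeys, a pre-sorted key list that bisect can search. A minimal sketch of that setup follows; the file name, sample terms, and part numbers are purely illustrative assumptions.

import bisect

# Hypothetical layout: part i of the index lives in "<infile>.index<i>.bz2",
# and indexFileWordMap records the first (smallest) term stored in each part.
infile = "wiki_dump"
indexFileWordMap = {"aardvark": 0, "llama": 1, "zebra": 2}
sortedIndexFileWordMapKeys = sorted(indexFileWordMap)

# bisect finds the greatest first-term <= the query term, which identifies the
# part that checkInIndexFileWordMap then scans line by line for "term=postings".
term = "mongoose"
pos = bisect.bisect(sortedIndexFileWordMapKeys, term)
part_number = indexFileWordMap[sortedIndexFileWordMapKeys[max(pos - 1, 0)]]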
Code example #30
File: Controller.py, Project: anaelgor/SearchEngine
def Main(cp, ip, to_stem):
    global __corpus_path
    global __index_path
    global doc
    global __stem_suffix
    create_city_db()
    Parser.stem = to_stem
    if to_stem is True:
        __stem_suffix = '_stem'
    else:
        __stem_suffix = ''

    start = time.time()
    data_set_Path(cp, ip)
    Indexer.create_posting_files(__stem_suffix)
    counter = 0
    for root, dirs, files in os.walk(__corpus_path):
        for file in files:
            if (stop==True):
                reset()  # will clear the memory of the program and remove the posting files and dictionary
                return
            #print("file!!!")
            end2 = time.time()
            # if ((end2-start)/60)>10 and ((end2-start)/60) <10.10:
            #     print(str(file))
            if str(file) != 'stop_words.txt':
                ReadFile.takeDocsInfoFromOneFile(str(pathlib.PurePath(root, file)))
                dic_of_one_file = Parser.parse(dic_to_parse)
                sorted_dictionary = collections.OrderedDict(sorted(dic_of_one_file.items()))
                index_start = time.time()
                Indexer.merge_dictionaries(sorted_dictionary)
                dic_to_parse.clear()
                counter += 1
            if counter == 100:
                Indexer.SaveAndMergePostings()
                counter = 0
    Indexer.SaveAndMergePostings()
    saveCityDictionaryToDisk(ip)
    saveMainDictionaryToDisk(ip)
    cleanDocsYeshuyot()
    saveDocumentDictionaryToDisk(ip)
    x= ReadFile.docs_dictionary
    saveLangListToDisk(ip)
    saveStopWordsDictToDisk(ip)

    #ranker things
    createAndSaveAvdlToDisk(ip)

    x=ReadFile.lang_list
    end2 = time.time()
    time_final = str((end2 - start) / 60)
    print("time of program: " + time_final)
    sendInfoToGUI(time_final)
Code example #31
def main():
    global stop_words
    stop_words = Indexer.read_stop_words()
    print "Parsing the corpus"
    parse_corpus()
    print "creating the index"
    build()
    print "Parsing the queries"
    queries = parse_queries()
    Retrieval.write_queries_to_file("stopped-queries", queries)
    query_id = 1
    for query in queries:
        tf_idf(index_dict, number_of_terms1, query, query_id)
        query_id = query_id + 1

    query_id = 1
    for query in queries:
        BM25(index_dict, query, query_id)
        query_id = query_id + 1
Code example #32
File: gui_support.py, Project: sankerr/Search-engine
            def build_engine():
                global stemmer
                global dist_path
                global corpus_path
                global root
                global w
                global in_process
                global term_dictionary_in_corpus

                reset_lists_on_screen_language()
                reset_lists_on_screen_city()
                reset_lists_on_screen_query()
                total = datetime.datetime.now()
                term_dictionary = {}
                num_of_post = Indexer.start_read(corpus_path, dist_path,
                                                 term_dictionary, stemmer)
                Indexer.extract_entity(term_dictionary, dist_path, num_of_post)
                num_of_doc = Indexer.merge_document_dictionary(
                    dist_path, num_of_post, stemmer)
                t1 = threading.Thread(target=fill_language_list)
                t1.start()

                t2 = threading.Thread(target=fill_city_list)
                t2.start()

                Indexer.new_merge(dist_path, term_dictionary, num_of_post,
                                  stemmer)
                sort_key = sorted(term_dictionary)
                sort_term_dictionary = {}
                for key in sort_key:
                    sort_term_dictionary[key] = term_dictionary[key]
                if stemmer:
                    file = open(dist_path + "/term_dictionary_stemmer.pkl",
                                "wb+")
                else:
                    file = open(dist_path + "/term_dictionary.pkl", "wb+")
                pickle.dump(sort_term_dictionary, file,
                            pickle.HIGHEST_PROTOCOL)
                term_dictionary_in_corpus = sort_term_dictionary
                file.close()
                num_of_terms = len(sort_term_dictionary)
                total_time = str(datetime.datetime.now() - total)
                showinfo(
                    'info', 'The number of documents in corpus: ' +
                    str(num_of_doc) + '\r\n' +
                    'The number of terms in the corpus: ' + str(num_of_terms) +
                    '\r\n' + 'The total time: ' + str(total_time))
                bord_state('normal')
Code example #33
def main():
    global number_of_terms1
    global index_dict

    print "Select 1 for BM25"
    print "Select 2 for tf-idf"
    print "select 3 for query likelihood"
    print "select 4 for BM25 with stemmed corpus and quires"

    choice = input("Choose retrieval model")

    if choice == 4 or choice == 5:
        Indexer.build_stem()

    if choice != 4 and choice != 5:
        print "Parsing the corpus ... "
        Indexer.parse_corpus()

        print "Building the index ... "
        Indexer.build()

    number_of_terms1 = Indexer.number_of_terms1
    index_dict = Indexer.index_dict

    print "Reading queries from query doc ..."
    read_query_doc()
    read_query_doc_stem()

    query_id = 1
    if choice == 4:
        for query in parsed_queries_stem:
            BM25(index_dict, query, query_id)
            query_id = query_id + 1

    if choice == 5:
        for query in parsed_queries_stem:
            tf_idf(index_dict, number_of_terms1, query, query_id)
            query_id += 1

    for query in parsed_queries:
        if choice == 1:
            BM25(index_dict, query, query_id)
        elif choice == 2:
            tf_idf(index_dict, number_of_terms1, query, query_id)
        else:
            QLM(index_dict, query, query_id)
        query_id = query_id + 1
Code example #34
def open_mosaic(image_set_directory, target_image, output_image):
    Indexer.run_index(image_set_directory)
    Stitcher.run_stitcher(target_image, 20, 'rgb', output_image)
Code example #35
def index_creation():
    Parser.main()
    Indexer.unigram_index()
    Indexer.output_index_to_file("unigram_index")
Code example #36
    with open("Outputs/" + model + ".txt", "a+") as out_file:
        sorted_scores = [
            (k, doc_scores[k])
            for k in sorted(doc_scores, key=doc_scores.get, reverse=True)
        ]
        for i in range(min(len(sorted_scores), 100)):  # top-100 documents, starting from the best match
            doc, score = sorted_scores[i]
            rank += 1
            out_file.write(
                str(q_id) + " Q0 " + doc + " " + str(rank) + " " + str(score) +
                " " + model + "\n")


if __name__ == '__main__':

    Indexer.unigram_index(True)
    INVERTED_INDEX = Indexer.INVERTED_INDEX
    DOC_TOKEN_COUNT = Indexer.DOC_TOKEN_COUNT

    models = ["StoppedIndexWithBM25", "StoppedIndexWithTFIDF"]
    for model in models:
        OUTPUT_FILE = "Outputs/" + model + ".txt"
        if os.path.exists(OUTPUT_FILE):
            os.remove(OUTPUT_FILE)

    query_file = open("cacm.query.txt", 'r')
    queries = []
    query = ""
    for line in query_file.readlines():
        if line == "\n":
            continue
Code example #37
sys.setdefaultencoding('utf-8')

def to_url(request):
    s = request.GET.get('text')
    s = s.split(" ")
    r = []
    for i in s:
        if i != " " and i !="":
            r.append(i)
    a = Crawler.Crawler(0, 1, r, '/home/katrin/databasetemp/')
    try:
        a.downloadPages()
    except Exception:
        pass
    try:
        d.start('/home/katrin/databasetemp/')
    except Exception as e:
        print e
    return HttpResponse('Done it')


def index_url(request):
    return render(request, 'search/index_url.html')

def find_res(request):
    s = Search_str.Search(request.GET.get('text'))
    ans = s.start()
    return HttpResponse(json.dumps({'urls': ans}), content_type="application/json")

def urls(request):
    return render(request, 'search/urls.html')
Code example #38
        sorted_scores = [
            (k, doc_scores[k])
            for k in sorted(doc_scores, key=doc_scores.get, reverse=True)
        ]
        for i in range(min(len(sorted_scores), 100)):  # top-100 documents, starting from the best match
            doc, score = sorted_scores[i]
            rank += 1
            model = "PseudoRelevanceWithKL_Divergence"
            out_file.write(
                str(q_id) + " Q0 " + doc + " " + str(rank) + " " + str(score) +
                " " + model + "\n")


if __name__ == '__main__':

    Indexer.unigram_index(False)
    INVERTED_INDEX = Indexer.INVERTED_INDEX
    DOC_TOKEN_COUNT = Indexer.DOC_TOKEN_COUNT
    if os.path.exists(OUTPUT_FILE):
        os.remove(OUTPUT_FILE)

    query_file = open("cacm.query.txt", 'r')
    queries = []
    query = ""
    for line in query_file.readlines():
        if line == "\n":
            continue
        if line.startswith("<DOCNO>") or line.startswith("<DOC>"):
            continue
        if line.startswith("</DOC>"):
            queries.append(query.strip().lower())
Code example #39
            if count % 3 == 0:
                self.id = content
                count = 0

            count = count + 1

        elif self.CurrentData == "text":
            self.text += content


# SAX Parser : As the xml file size is large
def Parse(xmlFileName):
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    Handler = WikiHandler()
    parser.setContentHandler(Handler)
    parser.parse(xmlFileName)


if (__name__ == "__main__"):
    if len(sys.argv) < 3:
        sys.stderr.write("Usage: " + sys.argv[0] +
                         " <test_dump.xml> <test_index>\n")
        sys.exit(2)
    xmlFileName = sys.argv[1]
    Parse(xmlFileName)
    #print final
    Indexer.indexing(final, sys.argv[2])
    print "time taken for script to run is ", datetime.now(
    ) - startTime  #time taken by the script
Code example #40
           
	    elif self.CurrentData == "id":
		global count                
		if count%3 == 0:
                    self.id = content
		    count = 0
                    
                count = count + 1
            
            elif self.CurrentData == "text":
                self.text += content

# SAX Parser : As the xml file size is large
def Parse(xmlFileName):
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    Handler = WikiHandler()
    parser.setContentHandler( Handler )
    parser.parse(xmlFileName)


if ( __name__ == "__main__"):
    if len(sys.argv) < 3:
		sys.stderr.write("Usage: " + sys.argv[0] + " <test_dump.xml> <test_index>\n")
		sys.exit(2)
    xmlFileName = sys.argv[1]
    Parse(xmlFileName)
    #print final
    Indexer.indexing(final,sys.argv[2])
    print "time taken for script to run is " ,datetime.now() - startTime    #time taken by the script
Code example #41
import xml.sax as sax
from sys import argv
from shutil import copyfileobj
import bz2
import time

import Indexer
from WikiSAXHandler import WikiContentHandler
import TokenStemmer

script, infile, outfile = argv

toPrintProfile = ""

start = int(round(time.time() * 1000))

Indexer.doInit(outfile)

profileStart = int(round(time.time() * 1000))
if infile.endswith(".bz2"):
    with bz2.BZ2File(infile, 'rb', compresslevel=9) as compressed_infile:
        sax.parse(compressed_infile, WikiContentHandler(outfile))
else:
    sax.parse(infile, WikiContentHandler(outfile))
profileEnd = int(round(time.time() * 1000))
toPrintProfile += "sax.parse completed. Time Taken in milliseconds = {0}\n".format(
    (profileEnd - profileStart))

toPrintProfile += "Index.profileTime in milliseconds = {0}\n".format(
    Indexer.profileTime)
toPrintProfile += "Index.profileTime1 in milliseconds = {0}\n".format(
    Indexer.profileTime1)
Code example #42
import xml.sax as sax
import re
from StringIO import StringIO
from sys import argv

from shutil import copyfileobj
import bz2

import Indexer
from WikiSAXHandler import WikiContentHandler
import TokenStemmer

script, infile, outfile = argv

Indexer.doInit(outfile)

if infile.endswith(".bz2"):
    with bz2.BZ2File(infile, 'rb', compresslevel=9) as compressed_infile:
        sax.parse(compressed_infile, WikiContentHandler(outfile))
else:
    sax.parse(infile, WikiContentHandler(outfile))

Indexer.linearWriter(outfile)

Indexer.linearMerger(outfile)

Indexer.writeIndexPartFiles(outfile)

Indexer.writeTitlePartFiles(outfile)