    def endElement(self, name):
        if name == "page":
            self.current_article.processArticle()  #### article processing for the page starts here
            Indexer.doIndex(self.current_article, self.outfile)

        to_store = ""
        if len(self.current_lines):
            try:
                to_store = (''.join(self.current_lines)).strip()
            except Exception as e:
                print "exception occurred at id={0} and title={1}\n {2}".format(
                    self.current_article.id, self.current_article.title, e)
        #to_store = self.current_characters.strip()
        if name == "title":
            self.current_article.title = to_store
        if name == "id":
            if self.parent_element == "page":
                self.current_article.id = to_store
        if name == "text":
            self.current_article.text = to_store
        self.elements.pop()
        if self.elements:
            self.current_element = self.parent_element
            if len(self.elements) == 1:
                self.parent_element = ""
            else:
                self.parent_element = self.elements[len(self.elements) - 1]
        else:
            self.current_element = ""
Example #3
def main():

    print("--- RETRIEVER ---")
    print("Select")
    print("1 for BM25")
    print("2 for tf-idf")
    print("3 for Query Likelihood Model")
    user_choice = input("Enter your choice: ")
    if user_choice not in ["1", "2", "3"]:
        print("\nInvalid input. Aborting . .")
        sys.exit()

    
    # sets "RETRIEVAL_MODEL" to the user chosen model.
    set_retrieval_model(user_choice)
    
    # Create a directory to save the results.  
    # and overwrite existing run file, if any 
    os.makedirs(RUN_OUTPUTS_DIR,exist_ok=True)
    global RETRIEVAL_MODEL
    output_file = os.path.join(RUN_OUTPUTS_DIR,RETRIEVAL_MODEL+"Run.txt")
    if os.path.exists(output_file):
        os.remove(output_file)
     
    # Generate the unigram index.
    # By default, not performing stopping.
    # So send False
    Indexer.unigram_index(False)    

    # Fetch the index generated.
    global INVERTED_INDEX
    INVERTED_INDEX = Indexer.INVERTED_INDEX
    global DOC_TOKEN_COUNT
    DOC_TOKEN_COUNT = Indexer.DOC_TOKEN_COUNT
      

    # Read all queries. 
    queries = extract_queries_from_file()   
    
    global QUERY_ID
    for query in queries:
        QUERY_ID +=1       
        
        # Dictionary of query term frequency
        query_term_freq = query_term_freq_map(query)
        
        # Fetch the inverted indexes corresponding to the terms
        # in the query.
        fetched_index = query_matching_index(query_term_freq)
                
        # Compute ranking scores of all docs for this query.
        doc_scores = compute_doc_scores(fetched_index, query_term_freq)
        
        # Write results to a textfile.
        output_to_file(doc_scores,QUERY_ID)        

        print("Completed Retrieval for query : " + query)

    print("End of Retrieval.")
Example #4
def Get_UL_Coord(contour,pad=10):
    """
    Get the upper left coordinate of the contour.
    """
    dists = []
    for c in contour:
        dists.append(distbetween(c[0][0],c[0][1],0,0))

    return (contour[Indexer.get_index_of_min(dists)[0]][0][0]-pad,contour[Indexer.get_index_of_min(dists)[0]][0][1]-pad)
Example #5
def data_set_Path(corpus_path, index_path):
    global __stopwords_path
    global __corpus_path
    global __index_path
    __stopwords_path = corpus_path + "\\stop_words.txt"
    Parser.set_stop_words_file(__stopwords_path)
    __corpus_path = corpus_path
    __index_path = index_path
    Indexer.set_path_to_postiong_files(__index_path)
Example #6
def Get_LR_Coord(contour,imgXmax,imgYmax, pad=10):    
    """
    Get the lower right coordinate of the contour.
    """
    dists = []
    for c in contour:
        dists.append(distbetween(c[0][0],c[0][1],imgXmax,imgYmax))

    return (contour[Indexer.get_index_of_min(dists)[0]][0][0]+pad,contour[Indexer.get_index_of_min(dists)[0]][0][1]+pad)
Example #7
def Get_UL_Coord(contour, pad=10):
    """
    Get the upper left coordinate of the contour.
    """
    dists = []
    for c in contour:
        dists.append(distbetween(c[0][0], c[0][1], 0, 0))

    return (contour[Indexer.get_index_of_min(dists)[0]][0][0] - pad,
            contour[Indexer.get_index_of_min(dists)[0]][0][1] - pad)
Example #8
def Get_LR_Coord(contour, imgXmax, imgYmax, pad=10):
    """
    Get the lower right coordinate of the contour.
    """
    dists = []
    for c in contour:
        dists.append(distbetween(c[0][0], c[0][1], imgXmax, imgYmax))

    return (contour[Indexer.get_index_of_min(dists)[0]][0][0] + pad,
            contour[Indexer.get_index_of_min(dists)[0]][0][1] + pad)
Example #9
def main():

    print("--- RETRIEVAL WITH SNIPPETS ---")
   
    # Generate the unigram index.
    # By default, not performing stopping.
    # So send False
    Indexer.unigram_index(False)    

    # Fetch the index generated.
    global INVERTED_INDEX
    INVERTED_INDEX = Indexer.INVERTED_INDEX
    global DOC_TOKEN_COUNT
    DOC_TOKEN_COUNT = Indexer.DOC_TOKEN_COUNT
       

    # Read all queries. 
    queries = extract_queries_from_file()   
    

    global QUERY_ID
    for query in queries:
        QUERY_ID +=1       
        
        # Dictionary of query term frequency
        query_term_freq = query_term_freq_map(query)
        
        # Fetch the inverted indexes corresponding to the terms
        # in the query.
        fetched_index = query_matching_index(query_term_freq)

        # Compute BM25 score for this query.
        doc_scores = BM25_score(fetched_index, query_term_freq)
        
        # Extract top 100 docs, for which we need
        # to generate snippets.
        sorted_scores = [(k, doc_scores[k]) for k in sorted(doc_scores, key=doc_scores.get, reverse=True)]
        doc_list = []
        for i in range(min(len(sorted_scores), 100)):
            k, v = sorted_scores[i]
            doc_list.append(k)
        
        
        ''' top100 = Counter(doc_scores).most_common(100)
        doc_list = []
        for k,v in top100: 
            doc_list.append(k) '''
        
        # Snippet generator prints snippets 
        # with query term highlighting, onto the console.
        SnippetGenerator.snippet_generator(doc_list,query, INVERTED_INDEX)
               

        print("Completed Retrieval for query : " + query)

    print("End of Retrieval with Snippets.")
Example #10
def index_retri():
    Indexer.parse_corpus()
    Indexer.build()
    number_of_terms1 = Indexer.number_of_terms1
    index_dict = Indexer.index_dict
    parsed_queries = Retrieval.read_query_doc()

    query_id = 1
    for query in parsed_queries:
        Retrieval.tf_idf(index_dict, number_of_terms1, query, query_id)
        query_id += 1
Example #11
def reset(param=None):
    global __corpus_path,__index_path, __stem_suffix
    if param == "Queries":
        Parser.reset()
        __corpus_path = ""
    else:
        ReadFile.reset()
        Parser.reset()
        Indexer.reset()
        remove_index_files()
        __stem_suffix = ''
        __corpus_path = ""
        __index_path = ""
Example #12
def task3a(model, raw_corpus_directory):
    project_directory = os.getcwd()
    p = Indexer.Parser()
    corpus_directory = p.build_corpus(raw_corpus_directory, stopped=True)
    output_directory = os.path.join(project_directory, "output")

    I = Indexer.InvertedIndexer(corpus_directory)
    I.ngram_indexer(1)  # builds a unigram indexes for each word
    r = Retriever.Retriever(
        corpus_directory, I, project_directory
    )  # create a Retriever class, which contains different retrieval model

    os.chdir(raw_corpus_directory)
    os.chdir(os.pardir)
    f = open('cacm.query.txt', 'r')
    soup = BeautifulSoup(f.read(), 'html.parser')
    f.close()

    f_stop_words = open('common_words.txt', 'r')
    stop_words_list = f_stop_words.readlines()
    stop_words = [i.strip() for i in stop_words_list]
    f_stop_words.close()
    file_name = os.path.join(output_directory, 'task3a_' + model + '.txt')
    f = open(file_name, 'w')  # open file for writing results
    for i in range(64):
        query_no = (soup.find('docno')).text.encode(
            'utf-8')  # extract query number and query
        (soup.find('docno')).decompose()
        query = (soup.find('doc')).text.encode('utf-8')
        (soup.find('doc')).decompose()

        r.process_query(query, stopped=True,
                        stopwords=stop_words)  # parse the query
        # r.clean_content(query)
        docs_and_scores = r.get_scores_for_docs(
            model, int(query_no))  # retrieve relevant documents

        # save results into appropriate file
        docs = docs_and_scores[0]
        scores = docs_and_scores[1]
        for i in range(100):
            f.write(str(query_no) \
                        + " Q0 " \
                        + str(docs[i]) + ' ' \
                        + str((i+1)) + " " \
                        + str(scores[i]) + " " \
                        + model + "\n")
    f.close()
Example #13
def main():

    print("--- PSEUDO RELEVANCE RETRIEVER ---")

    # Create a directory to save the results.  
    # and overwrite existing run file, if any 
    os.makedirs(RUN_OUTPUTS_DIR,exist_ok=True)    
    output_file = os.path.join(RUN_OUTPUTS_DIR,"PseudoRelBM25Run.txt")
    if os.path.exists(output_file):
        os.remove(output_file)

    # Generate the unigram index.
    # By default, not performing stopping.
    # So send False
    Indexer.unigram_index(False)    

    # Fetch the index generated.
    global INVERTED_INDEX
    INVERTED_INDEX = Indexer.INVERTED_INDEX
    global DOC_TOKEN_COUNT
    DOC_TOKEN_COUNT = Indexer.DOC_TOKEN_COUNT

    # Read all queries. 
    queries = extract_queries_from_file()   

    global QUERY_ID
    for query in queries:
        QUERY_ID +=1       
        
        # Dictionary of query term frequency
        query_term_freq = query_term_freq_map(query)
        
        # Fetch the inverted indexes corresponding to the terms
        # in the query.
        fetched_index = query_matching_index(query_term_freq)
                
        # Compute BM25 score for this query term.
        doc_scores = BM25_score(fetched_index, query_term_freq)

        # Update doc scores using Pseudo-relevance
        doc_scores = pseudo_relevance(query,doc_scores)
        
        # Write results to a textfile.
        output_to_file(doc_scores,QUERY_ID)        

        print("Completed Retrieval for query : " + query)

    print("End of Pseudo Relevance retrieval.")
Example #14
    def findSimilar(self,link,limit):

        # we call the text-reading function from the Crawler to read the new link;
        # we use the constructor with empty arguments
        crawler = Crawler.Crawler('', 0, 0, 0)


        self.limit = limit
        file = open("Data/page%d.txt" % self.limit, 'w')

        try:
            self.title , text = crawler.getText(link)
            # we combine the lists of string to a single string
            text = ''.join(text)
            for t in text:
                file.write(t)
            file.close()
        except:
            print("Link is not accessible")
            file.close()
            sys.exit(0)

        indexer = Indexer.Indexer()
        indexer.start()

        cosineSimilarity = indexer.getCosineSimilarity()



        linksId = [ i for i in range(self.limit)]

        linksIdSorted = [x for _,x in sorted(zip(cosineSimilarity,linksId),reverse=True)]

        return cosineSimilarity , linksIdSorted
Example #15
 def __init__(self):
     # initialize the Controller object, first we would need to initialize
     # the configuration system
     # the configuration is saved in the appdata system
     self.configuration = Configuration.Configuration()
     self.crawler = Indexer.FileCrawler()
     self.sep = os.sep
Example #16
def task3b(model):
    output_directory = os.path.join(os.getcwd(), "output")
    stemmed_corpus = get_stemmed_corpus()
    f = open('cacm_stem.query.txt', 'r')
    stemmed_queries = f.readlines()
    f.close()

    I = Indexer.InvertedIndexer('')
    I.stemmed_indexer(stemmed_corpus)
    r = Retriever.Retriever('', I, os.getcwd())
    file_name = os.path.join(output_directory,
                             'task3b_' + model + '_stemmed.txt')
    f = open(file_name, 'w')
    query_no = [12, 13, 19, 23, 24, 25, 50]
    q_iter = 0
    for each_query in stemmed_queries:
        r.process_query(each_query)
        docs_and_scores = r.get_scores_for_docs(model, query_no[q_iter])

        # save results into appropriate file
        docs = docs_and_scores[0]
        scores = docs_and_scores[1]
        for i in range(100):
            f.write(str(query_no[q_iter]) \
                    + " Q0 " \
                    + str(docs[i]) + ' ' \
                    + str((i + 1)) + " " \
                    + str(scores[i]) + " " \
                    + model + "\n")
        q_iter += 1
    f.close()
Example #17
File: nSyn.py Project: neo1989/nSyn
def localIndexer():
    d = eval('local("tree -Nif %s",True)' % LOCAL_DIR)
    filelist = d.split('\n')
    for path in filelist[1:-2]:
        path = _strEncode(path)
        index = path.replace(r'%s' % LOCAL_DIR,'')
        r = eval('local("test -d %s || md5sum %s",True)' % (path,path))
        if not r:
            md5sum = 0
            lastModification = 0
            isDir = 1
        else :
            md5sum = r.split('  ')[0] 
            lastModification = int(eval('local("stat -c %Y '+ path +'",True)'))
            isDir = 0

        Indexer.addLocal(index,md5sum,lastModification,isDir)
Example #18
File: nSyn.py Project: neo1989/nSyn
def remoteIndexer():
    d = eval('run("tree -Nif %s")' % REMOTE_DIR)
    filelist = d.split('\r\n')
    for path in filelist[1:-2]:
        path = _strEncode(path)
        index = path.replace(r'%s' % REMOTE_DIR,'') 
        r = eval('run("test -d %s || md5sum %s")' % (path,path))
        if not r:
            md5sum = 0
            lastModification = 0
            isDir = 1
        else :
            md5sum = r.split('  ')[0] 
            lastModification = int(eval('run("stat -c %Y '+ path +'")'))
            isDir = 0
        
        Indexer.addRemote(index,md5sum,lastModification,isDir)
Example #19
def main():
    global stop_words
    global parsed_queries

    Indexer.read_stop_words()
    Retrieval.read_query_doc()
    # generate the unigrams from the lucene results for k=10 or 20
    stop_words = Indexer.stop_words
    parsed_queries = Retrieval.parsed_queries

    # evaluating the dice coefficient on the search query with n=8 or 6
    query_id = 0
    for query in parsed_queries:
        query_id = query_id + 1
        result = generateUnigramDocFreq(stop_words, query_id)
        runSearchQuery(query.lower(), result, 8)

    Retrieval.write_queries_to_file("expanded-queries", expanded_queries)
Example #20
def read_query_doc():
    global parsed_queries
    fr = open('test-collection/cacm.query.txt', 'r')
    queries = fr.read()
    while queries.find('<DOC>') != -1:
        query = queries[queries.find('</DOCNO>') + 8:queries.find('</DOC>')]
        query = query.strip().replace('\n', " ")
        new_query = Indexer.transformText(query, True, True)
        parsed_queries.append(new_query)
        queries = queries[queries.find('</DOC>') + 6:]
    write_queries_to_file("parsed-queries", parsed_queries)
Example #21
def GetClothColor(hsv,search_width=45):
    """
    Find the most common HSV values in the image.
    In a well lit image, this will be the cloth
    """

    hist = cv2.calcHist([hsv], [1], None, [180], [0, 180])
    h_max = Indexer.get_index_of_max(hist)[0]
    
    hist = cv2.calcHist([hsv], [1], None, [256], [0, 256])
    s_max = Indexer.get_index_of_max(hist)[0]
    
    hist = cv2.calcHist([hsv], [2], None, [256], [0, 256])
    v_max = Indexer.get_index_of_max(hist)[0]

    # define range of blue color in HSV
    lower_color = np.array([h_max-search_width,s_max-search_width,v_max-search_width])
    upper_color = np.array([h_max+search_width,s_max+search_width,v_max+search_width])

    return lower_color, upper_color
Example #22
def run_index():
    # run an entire index build
    global docs_path
    global postings_path
    global is_stemming
    global indexer
    global dict_cache_path
    try:
        # check validation conditions
        if (not check_corpus_directory(docs_path.get())) or (not check_postings_directory(postings_path.get())):
            return
        result = tkMessageBox.askquestion("Run Index",
                                          "Are you sure?\n dont worry if the GUI"
                                          " is stuck or not responding - it is working", icon='warning')
        if result != 'yes':
            return
        print ('START TIME - ' + time.strftime("%H:%M:%S"))
        start_time = datetime.now()
        # reset the current memory of the project
        if (globs.main_dictionary is not None) and (bool(globs.main_dictionary)):
            globs.main_dictionary.clear()
        if (globs.cache is not None) and (bool(globs.cache)):
            globs.cache.clear()
        if (globs.documents_dict is not None) and (bool(globs.documents_dict)):
            globs.documents_dict.clear()
        # start indexing
        globs.stop_words = load_stop_words(docs_path.get())
        indexer = Indexer.Indexer(postings_path.get(), is_stemming.get())
        read_file = ReadFile.ReadFile(get_corpus_dir(docs_path.get()),
                                      indexer, globs.constants, globs.stop_words, is_stemming.get())
        read_file.index_folder()
        globs.num_of_documents = len(read_file.documents_dict)

        globs.documents_dict = read_file.documents_dict
        del read_file
        indexer.unite_temp_postings()
        globs.main_dictionary = indexer.main_dict
        indexer.build_document_weight(globs.documents_dict)
        # in case want to print stats, uncomment this
        # with open('{}{}'.format('stats', 'stem' if is_stemming.get() else ''),'w') as my_stats_file:
        #    my_stats_file.write('term,tf,df\n'.format())
        #    for key,val in main_dictionary.iteritems():
        #        my_stats_file.write('{},{},{}\n'.format(key,val.tf,val.df))
        globs.cache = indexer.cache_dict
        globs.average_doc_size = globs.average_doc_size/globs.num_of_documents
        dict_cache_path = postings_path
        print ('END TIME - ' + time.strftime("%H:%M:%S"))
        end_time = datetime.now()
        print_stats_at_end_of_indexing(end_time - start_time)
    except Exception as err:
        tkMessageBox.showinfo('ERROR', err)
        traceback.print_exc(file=stdout)
Example #23
def MaskTableBed(contours):
    """
    Mask out the table bed, assuming that it will be the biggest contour.
    """

    #The largest area should be the table bed
    areas = []
    for c in contours:
        areas.append(cv2.contourArea(c))

    #return the contour that delineates the table bed
    largest_contour = Indexer.get_index_of_max(areas)
    return contours[largest_contour[0]]
Example #24
def MaskTableBed(contours):
    """
    Mask out the table bed, assuming that it will be the biggest contour.
    """
            
    #The largest area should be the table bed    
    areas = []    
    for c in contours:
        areas.append(cv2.contourArea(c))
    
    #return the contour that delineates the table bed
    largest_contour = Indexer.get_index_of_max(areas)
    return contours[largest_contour[0]]
Example #25
def change_index(request):
    ans = URL.objects.all()
    cur_page = []
    for i in ans:
        cur_page.append((str(i.link), str(i.description)))
    if request.GET.get('request') == 'page':
        num_page = int(request.GET.get('num'))
        return HttpResponse(json.dumps({'urls': cur_page[5 * (num_page - 1): num_page * 5]}), content_type="application/json")
    elif request.GET.get('request') == 'save':
        print "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
        name = base64.b16encode(request.GET.get('url'))
        f = open('/home/katrin/databasetemp/'+name, "w")
        f.write(request.GET.get('txt'))
        f.close()

        try:
            d.start('/home/katrin/databasetemp/')
        except Exception, e:
            print e
        URL.objects.filter(link=request.GET.get('url')).delete()
        new_link = URL(link=request.GET.get('url'), description=request.GET.get('txt'))
        new_link.save()
        return  HttpResponse(json.dumps('ok'), content_type="application/json")
Example #26
def GetClothColor(hsv, search_width=45):
    """
    Find the most common HSV values in the image.
    In a well lit image, this will be the cloth
    """

    hist = cv2.calcHist([hsv], [1], None, [180], [0, 180])
    h_max = Indexer.get_index_of_max(hist)[0]

    hist = cv2.calcHist([hsv], [1], None, [256], [0, 256])
    s_max = Indexer.get_index_of_max(hist)[0]

    hist = cv2.calcHist([hsv], [2], None, [256], [0, 256])
    v_max = Indexer.get_index_of_max(hist)[0]

    # define range of blue color in HSV
    lower_color = np.array(
        [h_max - search_width, s_max - search_width, v_max - search_width])
    upper_color = np.array(
        [h_max + search_width, s_max + search_width, v_max + search_width])

    return lower_color, upper_color
Example #27
def checkInIndexFileWordMap(term):
    pos = bisect.bisect(sortedIndexFileWordMapKeys,term)
    if pos > 0:
        pos = pos - 1
    key = sortedIndexFileWordMapKeys[pos]
    index = indexFileWordMap[key]
    with bz2.BZ2File("{0}.index{1}.bz2".format(infile,index), 'rb', compresslevel=9) as ipartF:
        for line in ipartF:
            if line.startswith("{0}=".format(term)):
                parts = line.strip().split("=")
                if len(parts) == 2:
                    ffo = Indexer.getFOFromLine(parts[1])
                    return ffo
    return {}
Example #28
File: nSyn.py Project: neo1989/nSyn
def parallel():
    remote_indexes = Indexer.allRemote()
    for r in remote_indexes:
        l = Indexer.oneLocal(r.index)
        if not l:
            if r.isDir == 1:
                local_dir_to_create.append(r.index)
            else:
                files_from_remote.append(r.index)
        else:
            if r.isDir == 0 and r.md5sum != l.md5sum and r.lastModification > l.lastModification:
                files_from_remote.append(r.index)

    local_indexes = Indexer.allLocal()
    for l in local_indexes:
        r = Indexer.oneRemote(l.index)
        if not r:
            if l.isDir == 1:
                remote_dir_to_create.append(l.index)
            else:
                files_from_local.append(l.index)
        else:
            if l.isDir == 0 and l.md5sum != r.md5sum and l.lastModification > r.lastModification:
                files_from_local.append(l.index)
Example #29
def checkInIndexFileWordMap(term):
    pos = bisect.bisect(sortedIndexFileWordMapKeys,term)
    if pos > 0:
        pos = pos - 1
    key = sortedIndexFileWordMapKeys[pos]
    index = indexFileWordMap[key]
    #print "key = {0} and index = {1}".format(key,index)
    with bz2.BZ2File("{0}.index{1}.bz2".format(infile,index), 'rb', compresslevel=9) as ipartF:
        #print "checking file {0}.index{1}.bz2".format(infile,index)
        for line in ipartF:
            if line.startswith("{0}=".format(term)):
                parts = line.strip().split("=")
                if len(parts) == 2:
                    #word = parts[0]
                    ffo = Indexer.getFOFromLine(parts[1])
                    return ffo
    return {}
Example #30
def Main(cp, ip, to_stem):
    global __corpus_path
    global __index_path
    global doc
    global __stem_suffix
    create_city_db()
    Parser.stem = to_stem
    if to_stem is True:
        __stem_suffix = '_stem'
    else:
        __stem_suffix = ''

    start = time.time()
    data_set_Path(cp, ip)
    Indexer.create_posting_files(__stem_suffix)
    counter = 0
    for root, dirs, files in os.walk(__corpus_path):
        for file in files:
            if stop:
                reset()  # clears the program's memory and removes the posting files and the dictionary
                return
            #print("file!!!")
            end2 = time.time()
            # if ((end2-start)/60)>10 and ((end2-start)/60) <10.10:
            #     print(str(file))
            if str(file) != 'stop_words.txt':
                ReadFile.takeDocsInfoFromOneFile(str(pathlib.PurePath(root, file)))
                dic_of_one_file = Parser.parse(dic_to_parse)
                sorted_dictionary = collections.OrderedDict(sorted(dic_of_one_file.items()))
                index_start = time.time()
                Indexer.merge_dictionaries(sorted_dictionary)
                dic_to_parse.clear()
                counter += 1
            if counter == 100:
                Indexer.SaveAndMergePostings()
                counter = 0
    Indexer.SaveAndMergePostings()
    saveCityDictionaryToDisk(ip)
    saveMainDictionaryToDisk(ip)
    cleanDocsYeshuyot()
    saveDocumentDictionaryToDisk(ip)
    x= ReadFile.docs_dictionary
    saveLangListToDisk(ip)
    saveStopWordsDictToDisk(ip)

    #ranker things
    createAndSaveAvdlToDisk(ip)

    x=ReadFile.lang_list
    end2 = time.time()
    time_final = str((end2 - start) / 60)
    print("time of program: " + time_final)
    sendInfoToGUI(time_final)
Example #31
def main():
    global stop_words
    stop_words = Indexer.read_stop_words()
    print "Parsing the corpus"
    parse_corpus()
    print "creating the index"
    build()
    print "Parsing the queries"
    queries = parse_queries()
    Retrieval.write_queries_to_file("stopped-queries", queries)
    query_id = 1
    for query in queries:
        tf_idf(index_dict, number_of_terms1, query, query_id)
        query_id = query_id + 1

    query_id = 1
    for query in queries:
        BM25(index_dict, query, query_id)
        query_id = query_id + 1
Example #32
            def build_engine():
                global stemmer
                global dist_path
                global corpus_path
                global root
                global w
                global in_process
                global term_dictionary_in_corpus

                reset_lists_on_screen_language()
                reset_lists_on_screen_city()
                reset_lists_on_screen_query()
                total = datetime.datetime.now()
                term_dictionary = {}
                num_of_post = Indexer.start_read(corpus_path, dist_path,
                                                 term_dictionary, stemmer)
                Indexer.extract_entity(term_dictionary, dist_path, num_of_post)
                num_of_doc = Indexer.merge_document_dictionary(
                    dist_path, num_of_post, stemmer)
                t1 = threading.Thread(target=fill_language_list)
                t1.start()

                t2 = threading.Thread(target=fill_city_list)
                t2.start()

                Indexer.new_merge(dist_path, term_dictionary, num_of_post,
                                  stemmer)
                sort_key = sorted(term_dictionary)
                sort_term_dictionary = {}
                for key in sort_key:
                    sort_term_dictionary[key] = term_dictionary[key]
                if stemmer:
                    file = open(dist_path + "/term_dictionary_stemmer.pkl",
                                "wb+")
                else:
                    file = open(dist_path + "/term_dictionary.pkl", "wb+")
                pickle.dump(sort_term_dictionary, file,
                            pickle.HIGHEST_PROTOCOL)
                term_dictionary_in_corpus = sort_term_dictionary
                file.close()
                num_of_terms = len(sort_term_dictionary)
                total_time = str(datetime.datetime.now() - total)
                showinfo(
                    'info', 'The number of documents in corpus: ' +
                    str(num_of_doc) + '\r\n' +
                    'The number of terms in the corpus: ' + str(num_of_terms) +
                    '\r\n' + 'The total time: ' + str(total_time))
                bord_state('normal')
Example #33
def main():
    global number_of_terms1
    global index_dict

    print "Select 1 for BM25"
    print "Select 2 for tf-idf"
    print "select 3 for query likelihood"
    print "select 4 for BM25 with stemmed corpus and quires"

    choice = input("Choose retrieval model")

    if choice == 4 or choice == 5:
        Indexer.build_stem()

    if choice != 4 and choice != 5:
        print "Parsing the corpus ... "
        Indexer.parse_corpus()

        print "Building the index ... "
        Indexer.build()

    number_of_terms1 = Indexer.number_of_terms1
    index_dict = Indexer.index_dict

    print "Reading queries from query doc ..."
    read_query_doc()
    read_query_doc_stem()

    query_id = 1
    if choice == 4:
        for query in parsed_queries_stem:
            BM25(index_dict, query, query_id)
            query_id = query_id + 1

    if choice == 5:
        for query in parsed_queries_stem:
            tf_idf(index_dict, number_of_terms1, query, query_id)
            query_id += 1

    for query in parsed_queries:
        if choice == 1:
            BM25(index_dict, query, query_id)
        elif choice == 2:
            tf_idf(index_dict, number_of_terms1, query, query_id)
        else:
            QLM(index_dict, query, query_id)
        query_id = query_id + 1
Example #34
def open_mosaic(image_set_directory, target_image, output_image):
    Indexer.run_index(image_set_directory)
    Stitcher.run_stitcher(target_image, 20, 'rgb', output_image)
Example #35
def index_creation():
    Parser.main()
    Indexer.unigram_index()
    Indexer.output_index_to_file("unigram_index")
    with open("Outputs/" + model + ".txt", "a+") as out_file:
        sorted_scores = [
            (k, doc_scores[k])
            for k in sorted(doc_scores, key=doc_scores.get, reverse=True)
        ]
        for i in range(1, min(len(sorted_scores), 101)):
            doc, score = sorted_scores[i]
            rank += 1
            out_file.write(
                str(q_id) + " Q0 " + doc + " " + str(rank) + " " + str(score) +
                " " + model + "\n")


if __name__ == '__main__':

    Indexer.unigram_index(True)
    INVERTED_INDEX = Indexer.INVERTED_INDEX
    DOC_TOKEN_COUNT = Indexer.DOC_TOKEN_COUNT

    models = ["StoppedIndexWithBM25", "StoppedIndexWithTFIDF"]
    for model in models:
        OUTPUT_FILE = "Outputs/" + model + ".txt"
        if os.path.exists(OUTPUT_FILE):
            os.remove(OUTPUT_FILE)

    query_file = open("cacm.query.txt", 'r')
    queries = []
    query = ""
    for line in query_file.readlines():
        if line == "\n":
            continue
Example #37
sys.setdefaultencoding('utf-8')

def to_url(request):
    s = request.GET.get('text')
    s = s.split(" ")
    r = []
    for i in s:
        if i != " " and i !="":
            r.append(i)
    a= Crawler.Crawler(0, 1, r, '/home/katrin/databasetemp/')
    try:
        a.downloadPages()
    except Exception, e:
        pass
    try:
        d.start('/home/katrin/databasetemp/')
    except Exception, e:
        print e
    return HttpResponse('Done it')


def index_url(request):
    return render(request, 'search/index_url.html')

def find_res(request):
    s = Search_str.Search(request.GET.get('text'))
    ans= s.start()
    return HttpResponse(json.dumps({'urls': ans}), content_type="application/json")

def urls(request):
    return render(request, 'search/urls.html')
Example #38
        sorted_scores = [
            (k, doc_scores[k])
            for k in sorted(doc_scores, key=doc_scores.get, reverse=True)
        ]
        for i in range(1, min(len(sorted_scores), 101)):
            doc, score = sorted_scores[i]
            rank += 1
            model = "PseudoRelevanceWithKL_Divergence"
            out_file.write(
                str(q_id) + " Q0 " + doc + " " + str(rank) + " " + str(score) +
                " " + model + "\n")


if __name__ == '__main__':

    Indexer.unigram_index(False)
    INVERTED_INDEX = Indexer.INVERTED_INDEX
    DOC_TOKEN_COUNT = Indexer.DOC_TOKEN_COUNT
    if os.path.exists(OUTPUT_FILE):
        os.remove(OUTPUT_FILE)

    query_file = open("cacm.query.txt", 'r')
    queries = []
    query = ""
    for line in query_file.readlines():
        if line == "\n":
            continue
        if line.startswith("<DOCNO>") or line.startswith("<DOC>"):
            continue
        if line.startswith("</DOC>"):
            queries.append(query.strip().lower())
            if count % 3 == 0:
                self.id = content
                count = 0

            count = count + 1

        elif self.CurrentData == "text":
            self.text += content


# SAX Parser : As the xml file size is large
def Parse(xmlFileName):
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    Handler = WikiHandler()
    parser.setContentHandler(Handler)
    parser.parse(xmlFileName)


if (__name__ == "__main__"):
    if len(sys.argv) < 3:
        sys.stderr.write("Usage: " + sys.argv[0] +
                         " <test_dump.xml> <test_index>\n")
        sys.exit(2)
    xmlFileName = sys.argv[1]
    Parse(xmlFileName)
    #print final
    Indexer.indexing(final, sys.argv[2])
    print "time taken for script to run is ", datetime.now(
    ) - startTime  #time taken by the script
           
        elif self.CurrentData == "id":
            global count
            if count % 3 == 0:
                self.id = content
                count = 0

            count = count + 1

        elif self.CurrentData == "text":
            self.text += content


# SAX Parser : As the xml file size is large
def Parse(xmlFileName):
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    Handler = WikiHandler()
    parser.setContentHandler(Handler)
    parser.parse(xmlFileName)


if (__name__ == "__main__"):
    if len(sys.argv) < 3:
        sys.stderr.write("Usage: " + sys.argv[0] + " <test_dump.xml> <test_index>\n")
        sys.exit(2)
    xmlFileName = sys.argv[1]
    Parse(xmlFileName)
    #print final
    Indexer.indexing(final, sys.argv[2])
    print "time taken for script to run is ", datetime.now() - startTime  # time taken by the script
Example #41
import xml.sax as sax
from sys import argv

from shutil import copyfileobj
import bz2
import time

import Indexer
from WikiSAXHandler import WikiContentHandler
import TokenStemmer

script, infile, outfile = argv

toPrintProfile = ""

start = int(round(time.time() * 1000))

Indexer.doInit(outfile)

profileStart = int(round(time.time() * 1000))
if infile.endswith(".bz2"):
    with bz2.BZ2File(infile, 'rb', compresslevel=9) as compressed_infile:
        sax.parse(compressed_infile, WikiContentHandler(outfile))
else:
    sax.parse(infile, WikiContentHandler(outfile))
profileEnd = int(round(time.time() * 1000))
toPrintProfile += "sax.parse completed. Time Taken in milliseconds = {0}\n".format(
    (profileEnd - profileStart))

toPrintProfile += "Index.profileTime in milliseconds = {0}\n".format(
    Indexer.profileTime)
toPrintProfile += "Index.profileTime1 in milliseconds = {0}\n".format(
    Indexer.profileTime1)
Example #42
import xml.sax as sax
import re
from StringIO import StringIO
from sys import argv

from shutil import copyfileobj
import bz2

import Indexer
from WikiSAXHandler import WikiContentHandler
import TokenStemmer

script, infile, outfile = argv

Indexer.doInit(outfile)

if infile.endswith(".bz2"):
    with bz2.BZ2File(infile, 'rb', compresslevel=9) as compressed_infile:
        sax.parse(compressed_infile, WikiContentHandler(outfile))
else:
    sax.parse(infile, WikiContentHandler(outfile))

Indexer.linearWriter(outfile)

Indexer.linearMerger(outfile)

Indexer.writeIndexPartFiles(outfile)

Indexer.writeTitlePartFiles(outfile)