Example #1
File: index.py Project: sankosk/SIW
def main(args):
    indexer = Indexer()

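    # Index every line of the input file as a separate bag-of-words document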
    with io.open(args.texts, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            bag = BagOfWords(line, filter_stopwords=False)
            indexer.index(bag)
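    # Dump the index as JSON, gzip-compressed when --zip is given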
    open_func = gzip.open if args.zip else io.open
    index_ext = ".json.gz" if args.zip else ".json"
    with open_func(args.index + index_ext, mode="wb") as f:
        indexer.dump(f)
    return 0
Example #2
    def __init__(self, cluster_size=2000, dhash_size=4):
        self.cluster_size = cluster_size
        self.dhash_size = dhash_size
        self.indexer = Indexer()

        self.db_path = '../index/deeprelevance.db'
        self.temp_table = 'CREATE TEMPORARY TABLE dhash_filtered (dbid TEXT PRIMARY KEY)'
        self.temp_insert = 'INSERT INTO dhash_filtered VALUES(?)'

        self.dhash_stmt = 'SELECT dbid, dhash FROM features'
        self.dhash_bit_stmt = 'SELECT dbid, (dhash|?)&~(dhash&?) FROM features'

        self.feats_stmt = '''
Example #3
File: app.py Project: pawel717/PJN
def indexCorpus():
    indexer = Indexer(database)
    # index normal articles
    indexer.corpus_cursor = database.fetch_data("SELECT * FROM articles")
    indexer.compute_tf()
    indexer.compute_tf_idf()
    indexer.purge()
    # index lemmatized articles
    indexer.corpus_cursor = database.fetch_data("SELECT * FROM articles_lemma")
    indexer.output_catalog = "./indexes_lemmatized/"
    indexer.compute_tf()
    indexer.compute_tf_idf()
    indexer.purge()
Example #4
def main(search_terms):
    dbname = 'ftp_files.db'
    db = Database(dbname)

    xname = 'xapian.db'
    corpus = Indexer(xname)

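    # Run the search against the index and print the results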
    result = corpus.search(str(search_terms))
    print_results(result[0], result[1], result[2], db)

    # clean up
    corpus.close()
    db.close()
Example #5
 def __init__(self, readers=10, start='Main/HomePage', outfile=None):
     self.url_queue = SetQueue()
     self.url_queue.put(start)
     self.link_queue = Queue()
     self.index = Indexer()
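     # Reader threads consume URLs and emit discovered links; the counter thread records them in the index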
     self.readers = [
         TVTropes_Reader(self.url_queue, self.link_queue, daemon=True)
         for _ in range(readers)
     ]
     self.counter = TVTropes_Counter(self.link_queue,
                                     self.index,
                                     outfile=outfile,
                                     daemon=True)
Example #6
def parse_html(url, bs):
    print('Start parse html from url: ' + str(url))
    body = bs.find('body')
    if body is None:
        return
    raw_text = body.get_text()
    words = get_words_from_raw_text(raw_text)
    dict_words = get_dict_words(words[:100])

    # print dict_words
    print('Start Indexing url: ' + str(url))
    indexer = Indexer(url=url, words=dict_words)
    indexer.save()
Example #7
def form_example():
    from videosplitter import VideoSplitter
    from indexer import Indexer
    from search import Search

    if request.method == 'POST':  # this block is only entered when the form is submitted
        dataset_path = 'video_frames'
        index_path = 'index.csv'
        query_path = 'static/defaultvalues'
        result_path = 'static/result'

        video = request.files['video']
        query_image = request.files['image']

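        # Stage the bundled default files; they are used when no video or query image is uploaded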
        copyfile('static/defaultfiles/Video4_amn_cs445.mp4',
                 'static/defaultvalues/Video4_amn_cs445.mp4')
        copyfile('static/defaultfiles/query4_amn_cs445.png',
                 'static/defaultvalues/query4_amn_cs445.png')

        if (video.filename == ''):
            videofilename = 'static/defaultvalues/Video4_amn_cs445.mp4'
        else:
            # save video
            if os.path.exists(video.filename):
                os.remove(video.filename)
            video.save(os.path.join("", video.filename))
            videofilename = video.filename

        if (query_image.filename == ''):
            query_path = 'static/defaultvalues/query4_amn_cs445.png'
        else:
            # save query image
            if os.path.exists("static/defaultvalues/" + query_image.filename):
                os.remove("static/defaultvalues/" + query_image.filename)
            query_image.save(os.path.join(query_path, query_image.filename))
            query_path = os.path.join(query_path, query_image.filename)

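        # Split the uploaded video into frames, index the frames, then search them with the query image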
        videoSplitter = VideoSplitter('video_frames')
        videoSplitter.splitVideo(videofilename)
        indexer = Indexer('index.csv', 'video_frames')
        indexer.indexImages()
        search = Search(dataset_path, index_path, query_path, result_path)
        results = search.performSearch()

        output = []
        for i in range(len(results)):
            image = results[i]
            output.append(image)
        return render_template("result.html", images=output)

    return render_template("index_ssd.html")
Example #8
def indexit(tokenizer,
            filenames,
            store_positions=False,
            calculate_tfidf=False,
            memory_usage=20):
    indexer = Indexer(tokenizer,
                      'indexer',
                      store_positions=store_positions,
                      max_memory_usage=memory_usage)
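    # Index each corpus file, then merge the partial indexes (optionally computing tf-idf)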
    for filename in filenames:
        corpus_reader = CorpusReader(filename)
        indexer.index(corpus_reader)
    indexer.merge(calculate_tfidf)
    return indexer
Example #9
 def __init__(self, config=None, run_config=None):
     if not config:
         config = ConfigClass()
     if not run_config:
         run_config = RunConfigClass()
     self._run_config = run_config
     self._config = config
     self._parser = Parse(run_config)
     self._indexer = Indexer(run_config)
     self._model = None
     self.searcher = Searcher(self._parser,
                              self._indexer,
                              run_config,
                              model=self._model)
Example #10
def main():
    idx = Indexer()

    scan_queue = [
        # 'http://s28.bitdl.ir/Video/',
        # 'http://128.199.129.79:666/',
        # 'https://korea-dpr.com/mp3/',
        # 'http://46.4.132.219:999/',
        'https://mirror.futureweb.be/manjaro/arm-stable/',
    ]

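    # Scan the queued URLs and save the resulting URL list to urls.txt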
    urls = idx.scan(scan_queue)

    idx.save('urls.txt', urls)
Example #11
 def test_highlight_window_one(self):
     self.indexator = Indexer('database')
     test_file_one = open('test_highlight_window.txt', 'w')
     test_file_one.write('Alina Zakharova is a student')
     test_file_one.close()
     self.indexator.get_index_with_line('test_highlight_window.txt')
     del self.indexator
     self.search = SearchEngine('database')
     window = windows.Context_Window.get_window('test_highlight_window.txt',
                                                Position_Plus(0, 6, 15), 1)
     result = window.highlight_window()
     output_string = 'Alina <b>Zakharova</b> is'
     self.assertEqual(result, output_string)
     os.remove('test_highlight_window.txt')
Example #12
    def __init__(self, config=None):
        self._config = config

        if self._config:
            if not hasattr(self._config, 'toStem'):
                self._config.toStem = False
            if not hasattr(self._config, 'toLemm'):
                self._config.toLemm = False

        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        self.corpus_size = 0
        self.load_precomputed_model()
Example #13
File: tests.py Project: sankosk/SIW
    def test_index_creation(self):
        self.maxDiff = None

        indexer = Indexer()
        for text in self.texts:
            text = text.strip()
            bag = BagOfWords(
                text, enable_stemming=False, filter_stopwords=False)
            indexer.index(bag)
        got = indexer.to_dict()

        self.assertSequenceEqual(self.expected["docs_index"],
                                 got["docs_index"])
        self.assertDictEqual(self.expected["terms_index"], got["terms_index"])
Example #14
 def test_extend_window_rus_one(self):
     self.indexator = Indexer('database')
     test_file_one = open('test_extend_window_rus.txt', 'w')
     test_file_one.write('Пьер с грустью слышал над собою насмешки.')
     test_file_one.close()
     self.indexator.get_index_with_line('test_extend_window_rus.txt')
     del self.indexator
     self.search = SearchEngine('database')
     window = windows.Context_Window.get_window(
         'test_extend_window_rus.txt', Position_Plus(0, 0, 4), 1)
     window.extend_window()
     extended_window = Context_Window(
         'Пьер с грустью слышал над собою насмешки.',
         [Position_Plus(0, 0, 4)], 0, 41)
     self.assertEqual(window, extended_window)
Example #15
 def test_extend_window(self):
     self.indexator = Indexer('database')
     test_file_one = open('test_extend_window.txt', 'w')
     test_file_one.write('Alina Zakharova is a student!!')
     test_file_one.close()
     self.indexator.get_index_with_line('test_extend_window.txt')
     del self.indexator
     self.search = SearchEngine('database')
     window = windows.Context_Window.get_window('test_extend_window.txt',
                                                Position_Plus(0, 6, 15), 1)
     window.extend_window()
     extended_window = Context_Window('Alina Zakharova is a student!!',
                                      [Position_Plus(0, 6, 15)], 0, 30)
     self.assertEqual(window, extended_window)
     os.remove('test_extend_window.txt')
Example #16
 def test_not_crossed(self):
     self.indexator = Indexer('database')
     test_file_one = open('test_not_crossed_window.txt', 'w')
     test_file_one.write('The girl named Alina Zakharova is a student')
     test_file_one.close()
     self.indexator.get_index_with_line('test_not_crossed_window.txt')
     del self.indexator
     self.search = SearchEngine('database')
     window_A = windows.Context_Window.get_window(
         'test_not_crossed_window.txt', Position_Plus(0, 31, 33), 1)
     window_B = windows.Context_Window.get_window(
         'test_not_crossed_window.txt', Position_Plus(0, 8, 14), 1)
     crossed_AB = window_A.is_crossed(window_B)
     self.assertEqual(False, crossed_AB)
     os.remove('test_not_crossed_window.txt')
Example #17
File: tests.py Project: sankosk/SIW
    def test_dump(self):
        indexer = Indexer()
        for text in self.texts:
            text = text.strip()
            bag = BagOfWords(
                text, enable_stemming=False, filter_stopwords=False)
            indexer.index(bag)
        fd = StringIO()
        indexer.dump(fd)
        fd.seek(0)
        got = json.load(fd)

        self.assertSequenceEqual(self.expected["docs_index"],
                                 got["docs_index"])
        self.assertDictEqual(self.expected["terms_index"], got["terms_index"])
Example #18
    def __init__(self, dictionary_file, postings_file, rate=0.01, alpha=0.1,
                 expand=True, feedback=True, pagerank=True, pivoted=False, score=False):
        self.dictionary_file = dictionary_file
        self.postings_file = postings_file
        self.rate = rate
        self.alpha = alpha
        self.pagerank = pagerank
        self.pivoted = pivoted
        self.score = score

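        # Set up the stemmer and query refiner (expansion + relevance feedback), then load the dictionary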
        self.stemmer = PorterStemmer()
        self.indexer = Indexer(dictionary_file, postings_file)
        self.refiner = Refiner(indexer=self.indexer, expand=expand, feedback=feedback)

        self.indexer.LoadDict()
Example #19
 def create_new_indexer(self):
     candidate_link = self.links_queue.pop(
         timeout=Crawler.POP_TIMEOUT_IN_SECONDS)
     candidate_indexedPage, was_create = IndexedPage.objects.get_or_create(
         pk=candidate_link)
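     # Only start a new Indexer thread for pages that were not already in the index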
     if was_create:
         Indexer(indexed_page=candidate_indexedPage,
                 on_finished_indexing=self.on_indexer_finished,
                 main_thread_cmd_queue=self.main_thread_cmd_queue,
                 links_queue=self.links_queue).start()
         return True
     else:
         logger.info("Skipping {url}. Index already exists".format(
             url=candidate_link))
         return False
Example #20
 def __init__(self, config=None):
     """
            init engine with the relevant model - Thesaurus_Searcher
            :param config:
            """
     self._config = config
     try:
         self._reader = ReadFile(corpus_path=config.get__corpusPath())
     except Exception:
         self._reader = ReadFile("")
     self._parser = Parse()
     self._parser.STEMMER = config.toStem
     self._indexer = Indexer(config)
     self._model = Thesaurus_Searcher(self._indexer)
     self.last_parquet = False
Example #21
def main():
    parser = ArgumentParser()
    parser.add_argument("-p",
                        "--path",
                        dest="path",
                        default=None,
                        help="Document path")
    parser.add_argument("-t",
                        "--threads",
                        dest="threads",
                        help="Number of threads to launch")
    args = parser.parse_args()
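    # Fall back to the "docs" directory and 5 threads when arguments are omitted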
    path = os.path.abspath(args.path) if args.path else "docs"
    threads = int(args.threads) if args.threads else 5
    indexer = Indexer(path, threads)
    indexer.create_index()
Example #22
 def start_app(self):
     if self.box is None:
         self.box = Box(self.config)
         saveConfig(self.configPath, self.config)
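     # Synchronize the local folder with the box, then start the remote and local change observers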
     indexer = Indexer(self.config['path'], self.config['encryption_key'],
                       self.box)
     indexer.synchronize()
     self.eventList = EventList()
     self.remoteObserver = startRemote(self.config['path'],
                                       self.config['encryption_key'],
                                       self.box, self.eventList)
     self.localObserver = startLocal(self.config['path'],
                                     self.config['encryption_key'],
                                     self.box, self.eventList)
     self.set_icon('img/icon_active.png')
     self.isAppRunning = True
Example #23
def picture(structure='reporting',
            datadir=os.getcwd(),
            engine='dot',
            teammembers=True,
            openimage=True):
    """ Render an org chart PNG image and open it.

        STRUCTURE: reporting|teams """
    indexer = Indexer(datadir)
    indexer.load()
    indexer.index()
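    # Build the requested org chart from the indexed data and render it to an image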
    orggraph = OrgGraph(indexer, engine, teammembers)
    orggraph.buildgraph(structure)
    imagepath = orggraph.render()
    if openimage:
        showpicture(imagepath)
Example #24
def run_engine(corpus_path, output_path, stemming, queries,
               num_docs_to_retrieve, word2vec):
    """

    :return:
    """
    # print("start: ", time.asctime(time.localtime(time.time())))
    number_of_documents = 0
    num_of_writes = 1
    config = ConfigClass(corpus_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, word2vec)
    # documents_list = r.read_file(file_name='covid19_07-30.snappy.parquet')  # TODO - handel all files ~50 (can do with from multiprocessing.pool import ThreadPool)

    # Iterate over every document in the file
    counter = 0
    names = r.get_files_names_in_dir()
    for name in names:
        documents_list = r.read_file_by_name(file_name=str(name))
        for idx, document in enumerate(documents_list):
            parsed_document = p.parse_doc(document)  # parse the document
            if parsed_document == {}:  # RT
                continue
            number_of_documents += 1

            indexer.add_new_doc(parsed_document,
                                num_of_writes)  # index the document data
            counter += 1
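            # Flush the index buffer to disk once 500,000 documents have been indexed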
            if counter >= 500000:
                write_and_clean_buffer(indexer, num_of_writes, stemming,
                                       config, output_path)
                counter = 0
                # print("finish parser & index number: ", num_of_writes, " At: ", time.asctime(time.localtime(time.time())))
                num_of_writes += 1
        # print('Finished parsing and indexing. Starting to export files')
    write_and_clean_buffer(indexer, num_of_writes, stemming, config,
                           output_path)
    # print("finish parser & index: ", time.asctime(time.localtime(time.time())))
    indexer.inverted_idx = {
        key: val
        for key, val in indexer.inverted_idx.items() if val != 1
    }
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    # print("finish save index: ", time.asctime(time.localtime(time.time())))

    return num_of_writes
Example #25
def main(flist, plist='prefix.conf', dbname='ftp_files.db', xname='xapian.db', verbose=False):
    '''
    Main method: dispatches tasks to catalogue and index remote FTP servers.
    '''
    db = Database(dbname)
    indexer = Indexer(xname, writeable=True)
    
    # Read list of prefixes
    prefixes = []
    with open(plist) as f:
        prefixes = f.read().splitlines()
    
    # Read list of remote FTP servers
    servers = []
    with open(flist) as f:
        servers = f.read().splitlines()
    
    # Compile list of all servers
    for server in servers[:]:
        idx = servers.index(server)
        for prefix in prefixes:
            servers.insert(idx, prefix + '.' + server)
    
    for server in servers:
        if verbose: print "Scanning: %s" % server
        
        # Determine if server is a valid FTP site
        if not is_open_ftp_server(server):
            continue
        
        if verbose: print "\tServer is valid, connecting..."
        
        # Record all files on a remote server
        if not enumerate_files(server, db, verbose=verbose):
            print "\tCould not enumerate files on %s" % server
            continue
        
        # Download text and add to corpus
        if not index_content(server, indexer, db, verbose=verbose):
            print "\tCould not index %s" % server
    
    if verbose: print "\nCataloguing and indexing complete."
    
    # cleanup
    indexer.close()
    db.close()
Example #26
def main():
    # File_path is a class that stores the file paths for required documents.
    f = File_path()
    f.declare_paths()
    corpus = Create_corpus(f.raw_files_folder, True, True)
    corpus.parse_files(f.raw_files_folder, f.parsed_file_folder, True, True)

    a = Indexer()
    a.create_unigram_index(f.parsed_file_folder,f.index_file_path)

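    # Load the inverted index and compute document-length statistics for the ranking models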
    c = Context()
    index = c.read_inverted_index(f.index_file_path)
    DL = c.calculate_document_length(f.parsed_file_folder)
    AvDL = c.calculate_avg_doc_length(f.parsed_file_folder)
    q = Query_Parser()
    q.parse_queries(f.query_file_path,f.parsed_query_file_path)

    f1 = open(f.parsed_query_file_path,"r")
    query = dict()
    for lines in f1:
        lines = lines.split(":")
        query[lines[0]] = lines[1].strip()
    bm = BM25WithRelevance("BM25WithRelevance")
    bm.retrieve_bm25_scores(query,f.parsed_file_folder,AvDL,DL,index, f.relevance_file_path, f.output_folder_path)

    tf = Tf_idf("TfIdfRanking")
    tf.retrieve_tfidf_scores(DL,query,index,f.output_folder_path)

    q = QueryLikelihood("QLModel")
    q.retrieve_QL_scores(DL, query,index,f.output_folder_path)

    # Task 2 - pseudo relevance feedback
    pr = PseudoRelFeedback()
    pr.PRmain(f.parsed_file_folder,f.index_file_path,f.parsed_query_file_path,f.relevance_file_path,f.stop_file_path,f.output_folder_path)

    # task 3 - stemmed queries
    t = Task3()
    t.driver_stemmed(f)
    t.ranking_with_stopwords(f)

    # phase 2 - Snippet generation
    sg = SnippetGeneration(f.raw_files_folder)
    sg.get_queries(f.parsed_query_file_path)
    output_file_path = f.output_folder_path+"/"+"BM25WithRelevance"+".txt"
    sg.get_ranklist(output_file_path)
    sg.generate_snippet(f.snippet_file)
Example #27
 def test_two_files(self):
     test = open("text.txt", 'w' )
     test.write("test")
     test.close()
     test = open("text1.txt", 'w' )
     test.write("my my")
     test.close()
     self.indexer.indexing("text.txt")
     self.indexer = Indexer('database')
     self.indexer.indexing("text1.txt")
     words1 = dict(shelve.open("database"))
     words2 = {
         "my": {"text1.txt": [Position(0, 2),
                              Position(3, 5)]},
         "test": {"text.txt": [Position(0, 4)]}
     }
     self.assertEqual(words1, words2)
Example #28
 def test_extend_window_rus(self):
     self.indexator = Indexer('database')
     test_file_one = open('test_extend_window_rus.txt', 'w')
     test_file_one.write(
         'Прогать очень сложно! Алина Захарова студент лингвист!! Аня любит немецкий. В Петербурге идет дождь.'
     )
     test_file_one.close()
     self.indexator.get_index_with_line('test_extend_window_rus.txt')
     del self.indexator
     self.search = SearchEngine('database')
     window = windows.Context_Window.get_window(
         'test_extend_window_rus.txt', Position_Plus(0, 28, 36), 1)
     window.extend_window()
     extended_window = Context_Window(
         'Прогать очень сложно! Алина Захарова студент лингвист!! Аня любит немецкий. В Петербурге идет дождь.',
         [Position_Plus(0, 28, 36)], 22, 55)
     self.assertEqual(window, extended_window)
Example #29
 def test_extend_window_rus_two(self):
     self.indexator = Indexer('database')
     test_file_one = open('test_extend_window_rus.txt', 'w')
     test_file_one.write(
         'С разных сторон виднелись пожары. Пьер тогда еще не понимал значения сожженной Москвы и с ужасом смотрел на эти пожары.'
     )
     test_file_one.close()
     self.indexator.get_index_with_line('test_extend_window_rus.txt')
     del self.indexator
     self.search = SearchEngine('database')
     window = windows.Context_Window.get_window(
         'test_extend_window_rus.txt', Position_Plus(0, 34, 38), 1)
     window.extend_window()
     extended_window = Context_Window(
         'С разных сторон виднелись пожары. Пьер тогда еще не понимал значения сожженной Москвы и с ужасом смотрел на эти пожары.',
         [Position_Plus(0, 34, 38)], 0, 119)
     self.assertEqual(window, extended_window)
Example #30
def run_engine(corpus_path_, output_path_, stemming_):
    """

    :return:
    """

    number_of_documents = 0
    config = ConfigClass(corpuspath=corpus_path_, outputpath=output_path_, stemming=stemming_)
    config.corpusPath = corpus_path_
    config.savedFileMainFolder = output_path_
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)

    pathes = r.get_all_path_of_parquet()
    length_of_array = len(pathes)
    iteration = 0
    is_stemmer = config.toStem
    parsed_doc_list = list()
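    # Parse every parquet file, writing posting files every 200,000 documents and merging them at the end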
    for i in range(0, length_of_array):
        documents_list = r.get_documents(pathes[i][0], pathes[i][0])
        for doc, j in zip(documents_list, range(len(documents_list))):
            parsed_document = p.parse_doc(doc, stemmer=is_stemmer)
            if parsed_document is None:
                continue
            parsed_doc_list.append(parsed_document)
            number_of_documents += 1
            if number_of_documents % 200000 == 0:
                for doc in parsed_doc_list:
                    indexer.add_new_doc(doc)
                indexer.write_posting_to_txt_file_lower_upper(iteration)
                iteration += 1
                parsed_doc_list.clear()
                parsed_doc_list = list()
            elif j == len(documents_list) - 1 and i == length_of_array - 1:
                for doc in parsed_doc_list:
                    indexer.add_new_doc(doc)
                indexer.write_posting_to_txt_file_lower_upper(iteration)
                parsed_doc_list.clear()
                parsed_doc_list = list()
                indexer.merge_posting_file()
                indexer.merge_two_last_posting_file()
                indexer.split_posting_file_and_create_inverted_index()
                indexer.write_inverted_index_to_txt_file()
                number_of_documents = 0