def main():
    """ An example of how the search engine could be used. """
    seed = [
        'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d01.html',
        'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d06.html',
        'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d08.html'
    ]

    # Instantiate the crawler.
    crawler = Crawler()
    # Start the crawler with the seed.
    crawler.start_crawling(seed)
    # Access the pages.
    pages = crawler.pages
    # Print the content of the pages.
    print(pages)

    # Print the link structure.
    link_structure_txt = pages.get_link_structure_text()
    print(link_structure_txt)

    # Creation and printing of the index.
    indexer = Indexer()
    indexer.index_pages(pages)
    index = indexer.index
    print(index)

    # Calculation and printing of the PageRank.
    pagerank = Page_Rank()
    pagerank.fill_matrix(crawler)
    pagerank.calculate_probabilities(0.05, 0.95)
    pagerank.calculate_page_rank(0.04)
    print()

    # Scoring.
    example_queries = ['tokens', 'index', 'classification', 'tokens classification']
    analyzer = CosinusAnalyzer(index, pages)
    print(analyzer.get_length_of_pages_text())

    # Cosine scoring.
    print(StringUtil.header('cosine_scores.txt'))
    for query in example_queries:
        hits = analyzer.analyze(query)
        print(hits)
        print()

    # Cosine scoring combined with the PageRank.
    print(StringUtil.header('Cosinus combined with Page Rank'))
    for query in example_queries:
        hits = analyzer.analyze(query, combine_with_page_rank=True)
        print(hits)
        print()
def OperateData(request):
    message = ''
    try:
        # Get data that has not been operated on yet.
        rawdata = Rawdata.objects.filter(operated=False)[0]
        indxr = Indexer()
        indxr.set_raw(rawdata)
        message = indxr.operate()
        return HttpResponse("Finished processing " + rawdata.url.url + "<br/>" + message)
    except IndexError:
        return HttpResponse("All data have been operated on")
def run():
    while True:
        try:
            # Get data that has not been operated on yet.
            rawdata = Rawdata.objects.filter(operated=False)[0]
            print("Indexing " + rawdata.url.url)
            indxr = Indexer()
            indxr.set_raw(rawdata)
            message = indxr.operate()
            print("Finished processing " + rawdata.url.url)
        except IndexError:
            print("All data have been operated on")
            time.sleep(1)
def __init__(self, include_attributes=False):
    """
    Initializes structures and loads data into memory, such as the text index and the citation graph.
    """
    # Build the text index if it does not exist yet.
    if not os.path.exists(config.INDEX_PATH):
        indexer = Indexer()
        indexer.add_papers(config.INDEX_PATH, include_text=False)

    # Load the text index.
    self.index = Index(config.INDEX_PATH, similarity="tfidf")

    # Graph structure that allows fast access to nodes and edges.
    self.edges_lookup = GraphBuilder(get_all_edges())

    # Whether attributes should be fetched and included in the model for each type of node.
    # Should be True for visualization and False for pure relevance calculation.
    self.include_attributes = include_attributes

    # Pre-load the year and venue of each publication for faster access later.
    self.pub_years = {}
    self.pub_venues = {}
    rows = db.select(fields=["id", "year", "venue_id"], table="papers")
    for pub, year, venue in rows:
        self.pub_years[str(pub)] = int(year or 0)
        if venue:
            self.pub_venues[pub] = venue

    # Helper boolean to check whether citation contexts are going to be used
    # (some datasets don't have them available).
    self.use_contexts = (config.DATASET == 'csx')

    # Load vocabulary for the tokens in the citation contexts.
    # if self.use_contexts:
    #     self.ctxs_vocab, self.nctx = words.read_vocab(config.CTXS_VOCAB_PATH)

    log.debug("ModelBuilder constructed.")
def prepDataAndRun(self):
    indexer = Indexer()
    voca = indexer.get_voca()

    # Build word <-> id mappings for vocabulary entries whose field is "content".
    id2word = {}
    word2id = {}
    terms = []
    for x in range(len(voca)):
        voca[x] = list(voca[x])
        if voca[x][0] == "content":
            term = voca[x][1].decode("utf-8")
            word2id[term] = x
            id2word[x] = term
            terms.append(term)
    voca = terms

    # Load the bag-of-words corpus and map each word to its id.
    with open('bowcorpus.json', 'r') as fp:
        corpus = json.load(fp)
    bows = [bow for (id, bow) in corpus]
    idbows = []
    for bow in bows:
        idbow = [(word2id[word], freq) for word, freq in bow]
        idbows.append(idbow)

    LDA.build_lda_model(idbows, id2word)
def __init__(self):
    if IRModel.__instance is not None:
        raise Exception("Singleton bla")
    else:
        start_time = datetime.datetime.now()
        print("Starting initialization of IR Model! at " + str(start_time))
        self.indexer = Indexer()
        # self.dbHandler = DbHandler()
        # self.reputation_scores = ReputationScores()
        # self.authors = AuthorClustering(cache_enabled=True)
        # self.lda = LDA()
        IRModel.__instance = self
        end_time = datetime.datetime.now()
        print("Finished initialization of IR Model! at " + str(end_time) +
              "\nIt took: " + str(end_time - start_time))
# Prints usage.
def usage():
    print("usage: python index.py /path/to/data")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    args = sys.argv
    if len(args) != 2:
        usage()
        sys.exit()

    indexer = Indexer(NUM_INDEXER_THREADS)
    indexer.open()

    a = datetime.datetime.now()
    logging.info("Starting pass 1")
    for filename in os.listdir(args[1]):
        indexer.index_document(os.path.join(args[1], filename))
    indexer.close()
    logging.info("Pass 1 done")

    logging.info("Starting pass 2")
    indexer.build_index()
    logging.info("Pass 2 done")
    b = datetime.datetime.now()
import gensim
from indexer.indexer import Indexer
from db_handler import DbHandler
import json
import csv

indexer = Indexer()
corpusWithIds = indexer.get_index_information()

# Dump the bag-of-words corpus (with document ids) to JSON.
fp = open('indexAsBow.json', 'w')
json.dump(corpusWithIds, fp)

corpusWithoutIds = [bow for (id, bow) in corpusWithIds]
ids = [id for (id, bow) in corpusWithIds]
print(ids)

dictionary = gensim.corpora.Dictionary.load('nipsFilter50.dict')
corpus = gensim.corpora.MmCorpus('filtered_nips2.mm')
model = gensim.models.LdaModel.load('ldaFilter50.lda')

# For each topic, write the ten documents with the highest probability for that topic.
with open('topDocsForTopics.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    for topicnr in range(model.num_topics):
        docs = sorted(zip(ids, model[corpus]), reverse=True,
                      key=lambda x: abs(dict(x[1]).get(topicnr, 0.0)))
        for (docid, topics) in docs[:10]:
            for (topicid, prob) in topics:
                if topicid == topicnr:
                    writer.writerow([topicnr, docid, prob])
                    break
            print(docid, topics)
from indexer.indexer import Indexer
import sys

# Usage: python3 load_vectors.py ./datasets/ CISI.ALL.json CISI
if __name__ == '__main__':
    base_path = sys.argv[1]
    ext = sys.argv[2]
    name = sys.argv[3]
    idx = Indexer(base_path, ext, name)
    print(idx.vectorsPath)
    print(idx.keywordsPath)
def index_attributes(names, ingredients, steps):
    # All attributes combined.
    i = Indexer(names + ingredients + steps)
    index_all = i.get_index()
    dump_to_json('all', index_all)

    # Names.
    i = Indexer(names)
    index_names = i.get_index('name')
    dump_to_json('name', index_names)

    # Ingredients.
    i = Indexer(ingredients)
    index_ingredients = i.get_index('ingredients')
    dump_to_json('ingredients', index_ingredients)

    # Steps.
    i = Indexer(steps)
    index_steps = i.get_index('steps')
    dump_to_json('steps', index_steps)
random.seed(42)
random.shuffle(imagePaths)

# Determine the set of possible class labels from the image dataset, assuming
# that the images are in {directory}/{filename} structure, and create the
# label encoder.
print("[INFO] encoding labels...")
le = LabelEncoder()
le.fit([p.split("/")[-2] for p in imagePaths])

# Initialize the Overfeat extractor and the Overfeat indexer.
print("[INFO] initializing network...")
oe = Extractor(conf["model"])
featuresPath = conf["features_path"][0:conf["features_path"].rfind(".")] + "-" + conf["model"] + ".hdf5"
oi = Indexer(featuresPath, estNumImages=len(imagePaths))
print("[INFO] starting feature extraction...")

# Loop over the image paths in batches.
for (i, paths) in enumerate(dataset.chunk(imagePaths, conf["batch_size"])):
    # Load the set of images from disk and describe them.
    (labels, images) = dataset.build_batch(paths, conf["model"])
    features = oe.describe(images)

    # Loop over each (label, vector) pair and add them to the indexer.
    for (label, vector) in zip(labels, features):
        oi.add(label, vector)

    # Check to see if progress should be displayed.
    if i > 0:
        oi._debug("processed {} images".format((i + 1) * conf["batch_size"]))
import sys
from PySide.QtCore import Qt
from PySide.QtGui import QApplication, QPixmap, QSplashScreen
from dialog.directory_dialog import DirectoryDialog
from indexer.indexer import Indexer
from gui.mainwindow import MainWindow

if __name__ == "__main__":
    app = QApplication(sys.argv)
    dir = DirectoryDialog()
    if dir.exec_() and dir.result() != "" and dir.result() is not None:
        app.indexer = Indexer(dir.result())

        # Show a splash screen while the index data is being loaded.
        splash_pix = QPixmap('res/SplashScreen.png')
        splash = QSplashScreen(splash_pix, Qt.WindowStaysOnTopHint)
        splash.setMask(splash_pix.mask())
        splash.show()
        app.processEvents()

        app.indexer.load_data()
        app.doclist = None
        app.webview = None
        app.currentWord = None
        app.mainWindow = MainWindow()
        splash.finish(app.mainWindow)
        app.mainWindow.show()
        sys.exit(app.exec_())
    else:
        app.quit()
from watcher.Handler import Handler
from watchdog.observers import Observer
from indexer.indexer import Indexer
from watchdog.events import FileSystemEventHandler
import os

if __name__ == '__main__':
    # Watch the web documents directory for changes.
    web_dir_observer = Observer()
    event_handler = Handler()
    web_dir_observer.schedule(event_handler, path='docs_web/')
    web_dir_observer.start()

    # Build the index from the documents currently in the directory.
    nombresDocumentos = os.listdir("docs_web/")
    indexer = Indexer(list(map(lambda x: "docs_web/" + x, nombresDocumentos)))

    try:
        while True:
            word = input("----enter a term to search: ")
            indexer.searchIndex(word)
            # Check whether new files were created and get the names collected by the handler.
            is_change = event_handler.get_val()
            if len(is_change) != 0:
                indexer.update_indexer(is_change)
    except KeyboardInterrupt:
        # Run until keyboard interrupt, then stop and rejoin the observer.
        web_dir_observer.stop()
        web_dir_observer.join()