Example #1
def main():
    """ An example how the search engine could be used.  """

    seed = [
        'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d01.html',
        'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d06.html',
        'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d08.html'
    ]

    # Instantiate the crawler.
    crawler = Crawler()

    # Start the crawler with the seed.
    crawler.start_crawling(seed)

    # Access the pages.
    pages = crawler.pages

    # Print the content of the pages
    print(pages)

    # Print the link structure
    link_structure_txt = pages.get_link_structure_text()
    print(link_structure_txt)

    # Creation and printing of the index
    indexer = Indexer()
    indexer.index_pages(pages)
    index = indexer.index
    print(index)

    # Calculation and Printing of Page Rank
    pagerank = Page_Rank()
    pagerank.fill_matrix(crawler)
    pagerank.calculate_probabilities(0.05, 0.95)
    pagerank.calculate_page_rank(0.04)
    print()

    # Scoring
    example_queries = ['tokens', 'index', 'classification', 'tokens classification' ]
    analyzer = CosinusAnalyzer(index, pages)
    print(analyzer.get_length_of_pages_text())

    # Cosine scoring
    print(StringUtil.header('cosine_scores.txt'))
    for query in example_queries:
        hits = analyzer.analyze(query)
        print(hits)
    print()

    # Cosine scoring combined with the page rank.
    print(StringUtil.header('Cosinus combined with Page Rank'))
    for query in example_queries:
        hits = analyzer.analyze(query, combine_with_page_rank=True)
        print(hits)
    print()
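The CosinusAnalyzer and Page_Rank classes used above are project code and are not shown here. Purely as a reference for what cosine scoring boils down to, the following self-contained sketch scores a document against a query using term-frequency vectors (all names in it are illustrative, not taken from the project):

import math
from collections import Counter

def cosine_score(query, document):
    # Term-frequency vectors for the query and the document.
    q_vec = Counter(query.lower().split())
    d_vec = Counter(document.lower().split())

    # Dot product over the shared vocabulary.
    dot = sum(q_vec[t] * d_vec[t] for t in q_vec if t in d_vec)

    # Euclidean lengths of both vectors.
    q_len = math.sqrt(sum(v * v for v in q_vec.values()))
    d_len = math.sqrt(sum(v * v for v in d_vec.values()))
    if q_len == 0 or d_len == 0:
        return 0.0
    return dot / (q_len * d_len)

print(cosine_score("tokens classification", "classification of tokens and index terms"))

How analyze(query, combine_with_page_rank=True) folds the page rank into this score is not visible from the snippet; a weighted product or sum of the two scores would be typical.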
Example #2
def OperateData(request):
    message = ''
    try:
        rawdata = Rawdata.objects.filter(operated=False)[0]  # get data that has not been operated on yet
        indxr = Indexer()
        indxr.set_raw(rawdata)
        message = indxr.operate()
        return HttpResponse("Finished processing "+rawdata.url.url+"<br/>"+message)
    except IndexError:
        return HttpResponse("All Data have veen operated")
Example #3
def OperateData(request):
    message = ''
    try:
        rawdata = Rawdata.objects.filter(
            operated=False)[0]  # get data that has not been operated on yet
        indxr = Indexer()
        indxr.set_raw(rawdata)
        message = indxr.operate()
        return HttpResponse("Finished processing " + rawdata.url.url +
                            "<br/>" + message)
    except IndexError:
        return HttpResponse("All Data have veen operated")
Example #4
def run():
    while True:
        try:
            # get data that has not been operated on yet
            rawdata = Rawdata.objects.filter(operated=False)[0]
            print("Indexing " + rawdata.url.url)
            indxr = Indexer()
            indxr.set_raw(rawdata)
            message = indxr.operate()
            print("Finished processing " + rawdata.url.url)
        except IndexError:
            print("All Data have veen operated")
        time.sleep(1)
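Examples #2 to #4 follow the same pattern: fetch the first record that has not been operated on yet, run the Indexer over it, and rely on operate() to flag the record as processed so it is not picked up again. A framework-free sketch of that pattern (the Record class and process() function below are hypothetical stand-ins, not project code):

import time
from dataclasses import dataclass

@dataclass
class Record:
    url: str
    operated: bool = False

# In-memory stand-in for Rawdata.objects.filter(operated=False).
queue = [Record("http://example.com/a"), Record("http://example.com/b")]

def process(record):
    # Stand-in for Indexer().set_raw(record) followed by operate().
    record.operated = True  # flag it so it is not picked up again
    return "indexed " + record.url

while True:
    pending = [r for r in queue if not r.operated]
    if not pending:
        print("All data has been operated on")
        break
    print(process(pending[0]))
    time.sleep(1)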
Example #5
    def __init__(self, include_attributes=False):
        """
        Initializes structures and loads data into memory, such as the text index
        and the citation graph.
        """
        # Build text index if non-existing
        if not os.path.exists(config.INDEX_PATH):
            indexer = Indexer()
            indexer.add_papers(config.INDEX_PATH, include_text=False)

        # Load text index
        self.index = Index(config.INDEX_PATH, similarity="tfidf")

        # Graph structure that allows fast access to nodes and edges
        self.edges_lookup = GraphBuilder(get_all_edges())

        # Whether attributes should be fetched and included in the model for each type of node.
        # Should be True for visualization and False for pure relevance calculation.
        self.include_attributes = include_attributes

        # Pre-load the year and venue of each publication for faster access later
        self.pub_years = {}
        self.pub_venues = {}
        rows = db.select(fields=["id", "year", "venue_id"], table="papers")
        for pub, year, venue in rows:
            self.pub_years[str(pub)] = int(year or 0)
            if venue:
                self.pub_venues[pub] = venue

        # Create a helper boolean to check if citation contexts are
        # going to be used (some datasets don't have it available)
        self.use_contexts = (config.DATASET == 'csx')

        # Load vocabulary for the tokens in the citation contexts
        # if self.use_contexts:
        #   self.ctxs_vocab, self.nctx = words.read_vocab(config.CTXS_VOCAB_PATH)

        log.debug("ModelBuilder constructed.")
Example #6
    def prepDataAndRun(self):
        indexer = Indexer()
        voca = indexer.get_voca()
        id2word = {}
        word2id = {}
        terms = []
        # Keep only the vocabulary entries of the "content" field and build the
        # id <-> term mappings required by the LDA model.
        for x in range(len(voca)):
            voca[x] = list(voca[x])
            if voca[x][0] == "content":
                term = voca[x][1].decode("utf-8")
                word2id[term] = x
                id2word[x] = term
                terms.append(term)
        voca = terms

        with open('bowcorpus.json', 'r') as fp:
            corpus = json.load(fp)
        bows = [bow for (id, bow) in corpus]
        idbows = []

        for bow in bows:
            idbow = [(word2id[word], freq) for word, freq in bow]
            idbows.append(idbow)
        LDA.build_lda_model(idbows, id2word)
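LDA.build_lda_model is a project wrapper that is not shown. Assuming it sits on top of gensim, which Example #10 below loads a model from, the call presumably reduces to something like this sketch (the tiny corpus and the topic count are placeholders):

import gensim

# Tiny stand-in for the id2word / idbows pair built in prepDataAndRun above.
id2word = {0: "content", 1: "topic", 2: "model"}
idbows = [[(0, 2), (1, 1)], [(1, 1), (2, 3)]]

lda_model = gensim.models.LdaModel(corpus=idbows, id2word=id2word,
                                   num_topics=2, passes=10)
lda_model.save('ldaFilter50.lda')  # file name as used in Example #10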
Example #7
    def __init__(self):
        if IRModel.__instance is not None:
            raise Exception("IRModel is a singleton!")
        else:
            start_time = datetime.datetime.now()
            print("Starting initialization of IR Model! at " + str(start_time))
            self.indexer = Indexer()
            # self.dbHandler = DbHandler()
            # self.reputation_scores = ReputationScores()
            # self.authors = AuthorClustering(cache_enabled=True)
            # self.lda = LDA()
            IRModel.__instance = self
            end_time = datetime.datetime.now()
            print("Finished initialization of IR Model! at " + str(end_time) +
                  "\nIt took: " + str(end_time - start_time))
Example #8
# Prints usage
def usage():
    print("usage: python index.py /path/to/data")


if __name__ == "__main__":

    logging.basicConfig(level=logging.INFO)

    args = sys.argv
    if len(args) != 2:
        usage()
        sys.exit()

    indexer = Indexer(NUM_INDEXER_THREADS)
    indexer.open()

    a = datetime.datetime.now()
    logging.info("Starting pass 1")
    for filename in os.listdir(args[1]):
        indexer.index_document(os.path.join(args[1], filename))
    indexer.close()
    logging.info("Pass 1 done")

    logging.info("Starting pass 2")
    indexer.build_index()
    logging.info("Pass 2 done")
    b = datetime.datetime.now()


Example #9
# Prints usage
def usage():
    print("usage: python index.py /path/to/data")

if __name__ == "__main__":

    logging.basicConfig(level=logging.INFO)
    
    args = sys.argv
    if len(args) != 2:
        usage()
        sys.exit()

    indexer = Indexer(NUM_INDEXER_THREADS)
    indexer.open()

    a = datetime.datetime.now()
    logging.info("Starting pass 1")
    for filename in os.listdir(args[1]):
        indexer.index_document(os.path.join(args[1], filename))
    indexer.close()
    logging.info("Pass 1 done")

    logging.info("Starting pass 2")
    indexer.build_index()
    logging.info("Pass 2 done")
    b = datetime.datetime.now()

Example #10
import gensim
from indexer.indexer import Indexer
from db_handler import DbHandler
import json
import csv

indexer = Indexer()
corpusWithIds = indexer.get_index_information()
with open('indexAsBow.json', 'w') as fp:
    json.dump(corpusWithIds, fp)
corpusWithoutIds = [bow for (id, bow) in corpusWithIds]
ids = [id for (id, bow) in corpusWithIds]
print(ids)

dictionary = gensim.corpora.Dictionary.load('nipsFilter50.dict')
corpus = gensim.corpora.MmCorpus('filtered_nips2.mm')

model = gensim.models.LdaModel.load('ldaFilter50.lda')

with open('topDocsForTopics.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    for topicnr in range(model.num_topics):
        # Rank all documents by how strongly they load on this topic.
        docs = sorted(zip(ids, model[corpus]),
                      reverse=True,
                      key=lambda x: abs(dict(x[1]).get(topicnr, 0.0)))
        for (docid, topics) in docs[:10]:
            for (topicid, prob) in topics:
                if topicid == topicnr:
                    writer.writerow([topicnr, docid, prob])
                    break
            print(docid, topics)
Example #11
from indexer.indexer import Indexer
import sys

# python3 load_vectors.py ./datasets/ CISI.ALL.json CISI
if __name__ == '__main__':
    base_path = sys.argv[1]
    ext = sys.argv[2]
    name = sys.argv[3]
    idx = Indexer(base_path, ext, name)
    print(idx.vectorsPath)
    print(idx.keywordsPath)
Example #12
def index_attributes(names, ingredients, steps):
    # all
    i = Indexer(names + ingredients + steps)
    index_all = i.get_index()
    dump_to_json('all', index_all)
    # names
    i = Indexer(names)
    index_names = i.get_index('name')
    dump_to_json('name', index_names)
    # ingredients
    i = Indexer(ingredients)
    index_ingredients = i.get_index('ingredients')
    dump_to_json('ingredients', index_ingredients)
    # steps
    i = Indexer(steps)
    index_steps = i.get_index('steps')
    dump_to_json('steps', index_steps)
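The Indexer and get_index used above are project code and are not shown. For reference, a minimal inverted index over a list of texts (illustrative only, not the project's implementation) can be built like this:

from collections import defaultdict

def build_inverted_index(texts):
    # Map each token to the set of document ids it occurs in.
    index = defaultdict(set)
    for doc_id, text in enumerate(texts):
        for token in text.lower().split():
            index[token].add(doc_id)
    # Sets are not JSON-serializable, so convert them to sorted lists.
    return {token: sorted(ids) for token, ids in index.items()}

print(build_inverted_index(["garlic butter", "butter cookies"]))
# {'garlic': [0], 'butter': [0, 1], 'cookies': [1]}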
Example #13
random.seed(42)
random.shuffle(imagePaths)

# determine the set of possible class labels from the image dataset assuming
# that the images are in {directory}/{filename} structure and create the
# label encoder
print("[INFO] encoding labels...")
le = LabelEncoder()
le.fit([p.split("/")[-2] for p in imagePaths])

# initialize the Overfeat extractor and the Overfeat indexer
print("[INFO] initializing network...")
oe = Extractor(conf["model"])
featuresPath = conf["features_path"][
    0:conf["features_path"].rfind(".")] + "-" + conf["model"] + ".hdf5"
oi = Indexer(featuresPath, estNumImages=len(imagePaths))
print("[INFO] starting feature extraction...")

# loop over the image paths in batches
for (i, paths) in enumerate(dataset.chunk(imagePaths, conf["batch_size"])):
    # load the set of images from disk and describe them
    (labels, images) = dataset.build_batch(paths, conf["model"])
    features = oe.describe(images)

    # loop over each set of (label, vector) pair and add them to the indexer
    for (label, vector) in zip(labels, features):
        oi.add(label, vector)

    # check to see if progress should be displayed
    if i > 0:
        oi._debug("processed {} images".format((i + 1) * conf["batch_size"]))
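The Indexer that writes the extracted features to the .hdf5 file is not shown. Purely as an illustration of how such an HDF5-backed feature store can work with h5py (class name, dataset names, and shapes below are assumptions, not the project's API):

import h5py
import numpy as np

class FeatureStore:
    """Illustrative HDF5-backed store: one label and one feature row per image."""

    def __init__(self, path, feature_dim):
        self.db = h5py.File(path, "w")
        self.labels = self.db.create_dataset("labels", shape=(0,),
                                             maxshape=(None,), dtype=h5py.string_dtype())
        self.features = self.db.create_dataset("features", shape=(0, feature_dim),
                                               maxshape=(None, feature_dim), dtype="float32")

    def add(self, label, vector):
        # Grow both datasets by one row and append the new entry.
        n = self.features.shape[0]
        self.labels.resize((n + 1,))
        self.features.resize((n + 1, self.features.shape[1]))
        self.labels[n] = label
        self.features[n] = vector

    def close(self):
        self.db.close()

store = FeatureStore("features-example.hdf5", feature_dim=4)
store.add("dog", np.array([0.1, 0.2, 0.3, 0.4], dtype="float32"))
store.close()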
Example #14
import sys

from PySide.QtCore import Qt
from PySide.QtGui import QApplication, QPixmap, QSplashScreen

from dialog.directory_dialog import DirectoryDialog
from indexer.indexer import Indexer
from gui.mainwindow import MainWindow

if __name__ == "__main__":
    app = QApplication(sys.argv)
    dialog = DirectoryDialog()
    if dialog.exec_() and dialog.result() != "" and dialog.result() is not None:
        app.indexer = Indexer(dialog.result())
        splash_pix = QPixmap('res/SplashScreen.png')
        splash = QSplashScreen(splash_pix, Qt.WindowStaysOnTopHint)
        splash.setMask(splash_pix.mask())
        splash.show()
        app.processEvents()
        app.indexer.load_data()
        app.doclist = None
        app.webview = None
        app.currentWord = None
        app.mainWindow = MainWindow()
        splash.finish(app.mainWindow)
        app.mainWindow.show()
        sys.exit(app.exec_())
    else:
        app.quit()
Example #15
from watcher.Handler import Handler
from watchdog.observers import Observer
from indexer.indexer import Indexer
from watchdog.events import FileSystemEventHandler
import os

if __name__ == '__main__':
    web_dir_observer = Observer()
    event_handler = Handler()
    web_dir_observer.schedule(event_handler, path='docs_web/')
    web_dir_observer.start()
    nombresDocumentos = os.listdir("docs_web/")
    indexer = Indexer(list(map(lambda x: "docs_web/" + x, nombresDocumentos)))

    try:
        while True:
            word = input("---- enter a term to search: ")
            indexer.searchIndex(word)

            # Check whether new files were created and get the names collected by the handler.
            is_change = event_handler.get_val()
            if len(is_change) != 0:
                indexer.update_indexer(is_change)
    except KeyboardInterrupt:
        web_dir_observer.stop()
    # Wait for the observer thread to finish after it has been stopped.
    web_dir_observer.join()
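The Handler class imported from watcher.Handler is not included in the snippet. A minimal watchdog handler with the same get_val() contract (collect the paths of newly created files, hand them over once, then reset) could look like this sketch:

from watchdog.events import FileSystemEventHandler

class Handler(FileSystemEventHandler):
    def __init__(self):
        super().__init__()
        self._new_files = []

    def on_created(self, event):
        # Remember every newly created file; ignore new directories.
        if not event.is_directory:
            self._new_files.append(event.src_path)

    def get_val(self):
        # Hand the collected paths over exactly once, then reset.
        changed, self._new_files = self._new_files, []
        return changed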