Example #1
    def start(self):

        ### IMPORTS

        self.retriever = Retriever()
        self.time = __import__('time')
        self.json = __import__('json')
        self.re = __import__('re')

        ### INTRO ART

        rows, columns = os.popen('stty size', 'r').read().split()

        asci = ""
        f = open('assets/logo.txt', 'r')
        logoArt = f.read()
        f.close()
        for line in logoArt.split('\n'):
            #asci=asci+'\n'+' '*(int(0.5*int(columns) )-int(0.5*len(line)))+line
            asci = asci + '\n' + Fore.YELLOW + ' ' * 24 + line
        asci += '\n'
        asci += '\n' + ' ' * (int(0.5 * int(columns)) -
                              int(0.5 * len('InstaMiner'))) + 'InstaMiner'
        asci += '\n' + ' ' * (int(0.5 * int(columns)) -
                              int(0.5 * len('v' + self.v))) + 'v' + self.v
        asci += '\n\n' + ' ' * (int(0.5 * int(columns)) - int(
            0.5 * len('Type "help" for help'))) + 'Type "help" for help'

        self.intro = asci

        ### INIT

        self._buffer_start()
        self.cmdloop()
Example #2
def main(args, local_rank=0):

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)

    vocabs = dict()
    vocabs['src'] = Vocab(args.src_vocab, 0, [BOS, EOS])
    vocabs['tgt'] = Vocab(args.tgt_vocab, 0, [BOS, EOS])

    logger.info(args)
    for name in vocabs:
        logger.info("vocab %s, size %d, coverage %.3f", name,
                    vocabs[name].size, vocabs[name].coverage)

    set_seed(19940117)

    #device = torch.device('cpu')
    torch.cuda.set_device(local_rank)
    device = torch.device('cuda', local_rank)

    logger.info("start building model")
    logger.info("building retriever")
    if args.add_retrieval_loss:
        retriever, another_model = Retriever.from_pretrained(
            args.num_retriever_heads,
            vocabs,
            args.retriever,
            args.nprobe,
            args.topk,
            local_rank,
            load_response_encoder=True)
        matchingmodel = MatchingModel(retriever.model, another_model)
        matchingmodel = matchingmodel.to(device)
    else:
        retriever = Retriever.from_pretrained(args.num_retriever_heads, vocabs,
                                              args.retriever, args.nprobe,
                                              args.topk, local_rank)

    logger.info("building retriever + generator")
    model = RetrieverGenerator(vocabs, retriever, args.share_encoder,
                               args.embed_dim, args.ff_embed_dim,
                               args.num_heads, args.dropout, args.mem_dropout,
                               args.enc_layers, args.dec_layers,
                               args.mem_enc_layers, args.label_smoothing)

    model = model.to(device)

    model.eval()
    dev_data = DataLoader(vocabs,
                          args.dev_data,
                          args.dev_batch_size,
                          for_train=False)
    bleu = validate(device,
                    model,
                    dev_data,
                    beam_size=5,
                    alpha=0.6,
                    max_time_step=10)
Example #3
def phase2(model):
    stop = Stopper()
    stopped_corpus = stop.build_stopped_inverted_index()
    stop_inv_index = stopped_corpus[0]
    stop_total_corpus = stopped_corpus[1]
    task3a_folder = os.path.join(os.getcwd(), 'task3a')
    file_name = "task3a_cosine_stopped.txt"
    r = Retriever()
    fa = FileAccess()
    relevance_data = fa.get_relevance_data()
    query_dict = fa.read_queries()
    result_file = task3a_folder + '/' + file_name
    stopped_queries = stop.get_stopped_queries(query_dict)
    qe = QueryExpander(query_dict=stopped_queries,
                       filename=result_file,
                       clean=False)
    expanded_stopped_queries = qe.get_expanded_queries()
    r.run_all_queries(inverted_index=stop_inv_index,
                      total_corpus=stop_total_corpus,
                      relevance_data=relevance_data,
                      query_dict=expanded_stopped_queries,
                      model=model,
                      task_id="phase2",
                      notes="stopped_expanded",
                      store_queries='stopped_expanded')
Example #4
 def __init__(self):
     self.retriever = Retriever() #to use the filename function
     self.headers = {    
       'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
       'Accept' : 'text/xml,application/xml,application/xhtml+xml,\
         text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
       'Accept-Language' : 'fr-fr,en-us;q=0.7,en;q=0.3',
       'Accept-Charset' : 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'
     }
Example #5
def result_handler():
    """
    Post handler for request data
    Adds request data into sqlite database
    First pulls data in database, but if request ingredients are not in database ingredients,
        pulls from request URL and adds to database
    """
    desired_handlers = ['allrecipes', 'foodnetwork', 'epicurious']
    user_ingredients = [request.form[task] for task in request.form]
    ingredients = []

    for i in sorted(user_ingredients):
        a = i.lower()
        j = a.replace(' ', '')
        ingredients.append(j)

    exists = DBSession.query(Recipes).get(str(ingredients))

    if exists:
        ar_recipe_urls = ast.literal_eval(exists.ar_recipe_url)
        ar_recipe_names = ast.literal_eval(exists.ar_recipe_name)

        fn_recipe_urls = ast.literal_eval(exists.fn_recipe_url)
        fn_recipe_names = ast.literal_eval(exists.fn_recipe_name)

        ep_recipe_urls = ast.literal_eval(exists.ep_recipe_url)
        ep_recipe_names = ast.literal_eval(exists.ep_recipe_name)

    else:
        ar_recipe_links = list(
            Retriever(ingredients)(desired_handlers)[0]['allrecipes'])
        ar_recipe_urls = ar_recipe_links[0]
        ar_recipe_names = ar_recipe_links[1]

        fn_recipe_links = list(
            Retriever(ingredients)(desired_handlers)[0]['foodnetwork'])
        fn_recipe_urls = fn_recipe_links[0]
        fn_recipe_names = fn_recipe_links[1]

        ep_recipe_links = list(
            Retriever(ingredients)(desired_handlers)[0]['epicurious'])
        ep_recipe_urls = ep_recipe_links[0]
        ep_recipe_names = ep_recipe_links[1]

        ingredient_add = Recipes(ingredients=str(ingredients),
                                 ar_recipe_url=str(ar_recipe_urls),
                                 ar_recipe_name=str(ar_recipe_names),
                                 fn_recipe_url=str(fn_recipe_urls),
                                 fn_recipe_name=str(fn_recipe_names),
                                 ep_recipe_url=str(ep_recipe_urls),
                                 ep_recipe_name=str(ep_recipe_names))
        DBSession.add(ingredient_add)
        DBSession.commit()

    return ar_recipe_urls, ar_recipe_names, fn_recipe_urls, fn_recipe_names, ep_recipe_urls, ep_recipe_names
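
The handler above follows a simple look-up-or-fetch caching pattern: normalize the ingredients, check the database for a stored result, and only scrape the recipe sites on a cache miss. A stripped-down sketch of that pattern, with a plain dict standing in for the database layer (the helper below is hypothetical):

# Hedged sketch of the look-up-or-fetch pattern used by result_handler above.
# A dict stands in for DBSession/Recipes; fetch() stands in for the Retriever call.
_cache = {}

def lookup_or_fetch(user_ingredients, fetch):
    # normalize as result_handler does: sort, lowercase, drop spaces
    ingredients = [i.lower().replace(' ', '') for i in sorted(user_ingredients)]
    key = str(ingredients)
    if key not in _cache:       # cache miss: fetch once and store
        _cache[key] = fetch(ingredients)
    return _cache[key]          # cache hit or previously stored result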
Example #6
    def setUp(self):
        self.retriever = Retriever()
        self.output = []

        def fake_print(content):
            self.output.append(content)

        self.retriever.print = fake_print

        self.text_response = '{"success": true, "data": [209], "length": 1, "type": "uint8"}'
        self.json_response = {'type': 'uint8', 'data': [209], 'length': 1, 'success': True}
Example #7
class LinkAnalyzer(object):
    """creates abstract documents and feeds their attributes
   """
    def __init__(self):
        shelve('database1', 'c')
        self.term_extractor = parser.ExtractTerms()
        self.retriever = Retriever()

    def analyze(self, url, links):
        """creates a document and sets its outgoing links
      """
        self.db = shelve('database1', 'w')
        key = md5(url).hexdigest()
        #if the document is already in the database, just add its outgoing links

        if key in self.db.iterkeys():
            doc = self.db[key]
            doc.insertOL(links)
            doc.url = url
            document = open(self.retriever.filename(url)).read()
            doc.all_terms = self.term_extractor.get_terms(document)
            unique_terms = self.term_extractor.get_unique_terms(document)
            doc.unique_terms_freq = self.term_extractor.count_term_frequencies(
                unique_terms, document)
            #print self.db[key].outgoingLinks
        #if there is no document for the url, create a document and add its outgoing links
        if key not in self.db.iterkeys():
            newDoc = Document(url)
            newDoc.insertOL(links)
            newDoc.url = url
            document = open(self.retriever.filename(url)).read()
            newDoc.all_terms = self.term_extractor.get_terms(document)
            unique_terms = self.term_extractor.get_unique_terms(document)
            newDoc.unique_terms_freq = self.term_extractor.count_term_frequencies(
                unique_terms, document)
            self.db[key] = newDoc
            #print self.db[key].outgoingLinks
        #self.extractLinksfromResponse(url,links)
        self.db.close()

    def extractLinksfromResponse(self, url, links):
        """analyses the incoming links from the response
      """
        for link in links:
            key = md5(link).hexdigest()
            if key in self.db.iterkeys():
                doc = self.db[key]
                doc.insertIL(url)
            else:
                newDo = Document(link)
                newDo.insertIL(url)
                #print type(newDo)
                #print type(key)
                self.db[key] = newDo
Example #8
 def __init__(self):
     self.visited_links = []
     self.links_queue = deque([])
     self.domain = ''
     self.same_domain = True
     self.pageRetriever = Retriever()
     self.downloader = Downloader()
     self.linkanalyser = linkanalyser.LinkAnalyzer()
     logging.basicConfig(filename='crawler.log',
                         format='%(levelname)s:%(message)s',
                         level=logging.INFO)
Example #9
class LinkAnalyzer(object):
    """creates abstract documents and feeds their attributes
   """

    def __init__(self):
        shelve("database1", "c")
        self.term_extractor = parser.ExtractTerms()
        self.retriever = Retriever()

    def analyze(self, url, links):
        """creates a document and sets its outgoing links
      """
        self.db = shelve("database1", "w")
        key = md5(url).hexdigest()
        # if the document is already in the database, just add its outgoing links

        if key in self.db.iterkeys():
            doc = self.db[key]
            doc.insertOL(links)
            doc.url = url
            document = open(self.retriever.filename(url)).read()
            doc.all_terms = self.term_extractor.get_terms(document)
            unique_terms = self.term_extractor.get_unique_terms(document)
            doc.unique_terms_freq = self.term_extractor.count_term_frequencies(unique_terms, document)
            # print self.db[key].outgoingLinks
        # if there is no document for the url, create a document and add its outgoing links
        if key not in self.db.iterkeys():
            newDoc = Document(url)
            newDoc.insertOL(links)
            newDoc.url = url
            document = open(self.retriever.filename(url)).read()
            newDoc.all_terms = self.term_extractor.get_terms(document)
            unique_terms = self.term_extractor.get_unique_terms(document)
            newDoc.unique_terms_freq = self.term_extractor.count_term_frequencies(unique_terms, document)
            self.db[key] = newDoc
            # print self.db[key].outgoingLinks
        # self.extractLinksfromResponse(url,links)
        self.db.close()

    def extractLinksfromResponse(self, url, links):
        """analyses the incoming links from the response
      """
        for link in links:
            key = md5(link).hexdigest()
            if key in self.db.iterkeys():
                doc = self.db[key]
                doc.insertIL(url)
            else:
                newDo = Document(link)
                newDo.insertIL(url)
                # print type(newDo)
                # print type(key)
                self.db[key] = newDo
Example #10
 def __init__(self, filename, query_dict, top_k=12, n=5, clean=True):
     r = Retriever()
     if not clean:
         self.total_corpus = r.get_total_corpus(folder='stopped')
     else:
         self.total_corpus = r.get_total_corpus(folder='clean')
     self.k = top_k
     self.n = n
     fa = FileAccess()
     self.query_dict = query_dict
     self.results = fa.read_result_file(filename=filename)
     return
Example #11
 def retrieve_remote_video(self, records):
     """
     Retrieve video from remote server.
     """
     for key, val in records.items():
         torrent_path = (self.DEFAULT_REMOTE_VIDEO_DIR+val).encode('utf-8')
         #existence = exists(torrent_path, use_sudo=False, verbose=True)
         #if not existence:
         #    print (red('Downloaded video not exists on path - %s' %torrent_path))
         #    continue
         retriever = Retriever(torrent_path)
         retriever.start()
         self.stop_seeding(key)
Example #12
class Downloader(object):
    """There are two downloaders
    download() uses the urllib2 module
    CDownload() uses curl to download the pages. This results in fast page downloads
    """
    def __init__(self):
        self.retriever = Retriever() #to use the filename function
        self.headers = {    
          'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
          'Accept' : 'text/xml,application/xml,application/xhtml+xml,\
            text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
          'Accept-Language' : 'fr-fr,en-us;q=0.7,en;q=0.3',
          'Accept-Charset' : 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'
        }
    
    
    def download(self,url):
        """downloads the webpage indicated by url and saves it in a 
        file with an absolute path as that of the url
        """                  
        reqObj = urllib2.Request(url, None, self.headers)
        try:
            urlObj = urllib2.urlopen(reqObj)
            response = urlObj.readlines()
        except Exception:
            return 
        #write the content of the response object to the file
        file = open(self.retriever.filename(url), 'w')
        
        for line in response:
            file.writelines(line)
        print url + "**** crawled"
        
        logging.info("* crawled %s \n"%url)
        file.close()
            
        return 1         

    def CDownload(self, url):
        try:
            file_name = self.retriever.filename(url)
            #curl downloads the file and writes it into a file
            os.system("curl %s -o %s"%(url, file_name))
            print url + "**** crawled"
            logging.info("* crawled %s \n"%url)
            response=open(file_name, "r").read()
        except IOError:
            return 0
        
        return response 
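
A brief usage sketch for the Downloader above. It is hedged: the URL is only a placeholder, and the class relies on the Retriever.filename() helper and the Python 2 environment shown in the surrounding examples.

# Minimal usage sketch (placeholder URL).
downloader = Downloader()

# curl-based download: returns the page body, or 0 on IOError
body = downloader.CDownload('http://example.com/')

# urllib2-based download: writes the page to Retriever.filename(url), returns 1 on success
downloader.download('http://example.com/')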
Example #13
    def __init__(self, config, config_key):

        self.config = config
        self.config_key = config_key

        # For Understanding User Inputs
        self.input_yes_or_no = json.load(
            open(config[config_key]['input_yes_or_no']))
        self.input_about = json.load(open(config[config_key]['input_about']))
        self.location_file_path = config[config_key]['location']
        self.cuisine_file_path = config[config_key]['cuisine']
        self.food_file_path = config[config_key]['food']

        # For Generating Responses
        self.response_greetings = json.load(
            open(config[config_key]['response_greetings']))
        self.response_yes_no = json.load(
            open(config[config_key]['response_yes_no']))
        self.response_about = json.load(
            open(config[config_key]['response_about']))
        self.response_general = json.load(
            open(config[config_key]['response_general']))
        self.response_for_business = json.load(
            open(config[config_key]['response_for_business']))

        # Helpers
        self.retriever = Retriever(config, config_key)

        # read in valid locations
        with open(self.location_file_path, 'r') as location_file:
            self.valid_locations = [
                location.lower()
                for location in location_file.read().splitlines()
            ]

        # read in cuisines
        with open(self.cuisine_file_path, 'r') as cuisine_file:
            self.known_cuisines = [
                cuisine.lower()
                for cuisine in cuisine_file.read().splitlines()
            ]

        # read in food
        with open(self.food_file_path, 'r') as food_file:
            self.known_foods = [
                food.lower() for food in food_file.read().splitlines()
            ]

        self.state_after_response = State.understood_nothing
Example #14
def main():
    '''Things to do when this module is called'''

    r = Retriever()
    c = raw_input('Enter a search term (q to quit): ')
    ask = True

    while not should_terminate(c):
        i = 0
        ask = True
        print('\n############ RESULTS ############\n')

        link_pq = get_links(c, r)
        for tuple in link_pq:
            link = tuple[1]

            if print_url(link):
                i += 1

            if i != 0 and i % 10 == 0:
                print('#################################\n')
                print("Press Enter to list more results.")
                c = raw_input('Enter a search term (q to quit): ')
                if c != '':
                    ask = False
                    break
                print('\n#################################\n')

        if ask:
            print('#################################\n')
            c = raw_input('Enter a search term (q to quit): ')

    print('\n############\n# Goodbye! #\n############\n')
Example #15
    def start(self):
        try:
            with open(config.statusFile, "r") as f:
                if f.read() == config.STATUS_LINE:
                    return
        except:
            pass
        try:
            with open(config.statusFile, "w") as f:
                f.write(config.STATUS_LINE)
        except IOError as e:
            print >>sys.stderr, e
            return

        if self.daemonize():
            return

        self.startTime = datetime.now()
        self.storage = Storage()
        self.btserver = BtServer(self.storage)
        self.retriever = Retriever(self.storage)
        self.btserver.start()
        self.retriever.start()

        daemon = Pyro4.Daemon(port=config.pyroPort)
        uri = daemon.register(self, config.PNAME)
        log.info("Eris daemon URI: [{}]".format(uri))
        self.running = True
        daemon.requestLoop(loopCondition=lambda: self.running)

        daemon.unregister(config.PNAME)
        daemon.close()
Example #16
def load_retriever(args,
                   device,
                   task_tokenizer,
                   retriever_tokenizer,
                   finetuned_path=None,
                   stored_index=None,
                   train_use_idx=None):
    print(
        f"\nLoading retriever: {finetuned_path if finetuned_path is not None else args.retrieval_model}\n"
    )
    config = AutoConfig.from_pretrained(args.retrieval_model)
    config.__dict__.update(args.__dict__)
    model = Retriever.from_pretrained(args.retrieval_model,
                                      config=config,
                                      cache_dir=args.cache_dir,
                                      task_tokenizer=task_tokenizer,
                                      retriever_tokenizer=retriever_tokenizer,
                                      stored_index=stored_index,
                                      train_use_idx=train_use_idx)
    model.resize_token_embeddings(len(retriever_tokenizer))
    if args.reinitialize_retriever and finetuned_path is None:
        model.init_weights()
    if finetuned_path is not None:
        model_state_dict = torch.load(
            finetuned_path, map_location=lambda storage, loc: storage
        )  # args for preventing memory leakage across gpus
        utils.rectify_mismatched_embeddings(model, model_state_dict,
                                            retriever_tokenizer)
        model.load_state_dict(model_state_dict)
    model = model.to(device)
    return model
Example #17
def get_model_and_tokenizer(args):
    model_config = UnilmConfig.from_pretrained(
        args.config_name if args.config_name else 'unilm-base-cased',
        cache_dir=args.cache_dir if args.cache_dir else None)
    config = BertForSeq2SeqConfig.from_exist_config(
        config=model_config,
        label_smoothing=args.label_smoothing,
        max_position_embeddings=args.max_source_seq_length +
        args.max_target_seq_length)

    logger.info("Model config for seq2seq: %s", str(config))

    tokenizer = UnilmTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else 'unilm-base-cased',
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)

    generator = BertForSequenceToSequence.from_pretrained(
        'unilm-base-cased',
        config=config,
        model_type='unilm',
        reuse_position_embedding=True,
        cache_dir=args.cache_dir if args.cache_dir else None)
    generator.to(args.device)

    classifer = Classifier(config.hidden_size, args.num_labels)
    classifer.to(args.device)

    logger.info("Initialize retriever.")
    retriever = Retriever(args, tokenizer)
    return generator, classifer, tokenizer, retriever
Example #18
def task3b(model):
    stem = Stemmer()
    r = Retriever()
    stem_total_corpus = stem.build_stemmed_data()
    stem_inv_index = stem.build_stemmed_index()
    fa = FileAccess()
    relevance_data = fa.get_relevance_data()
    stemmed_queries = fa.get_stem_queries()

    r.run_all_queries(inverted_index=stem_inv_index,
                      total_corpus=stem_total_corpus,
                      relevance_data=relevance_data,
                      query_dict=stemmed_queries,
                      model=model,
                      task_id="3b",
                      notes="stemmed",
                      store_queries='stemmed')
Example #19
def task1(notes=''):
    r = Retriever()
    fa = FileAccess()
    query_dict = fa.read_queries()
    corpus = r.get_corpus(True)
    inverted_index = corpus[0]
    total_corpus = corpus[1]
    relevance_data = fa.get_relevance_data()

    for model in models:
        r.run_all_queries(inverted_index=inverted_index,
                          total_corpus=total_corpus,
                          relevance_data=relevance_data,
                          query_dict=query_dict,
                          model=model,
                          task_id="1",
                          notes=notes)
Example #20
def task3a(model):
    stop = Stopper()
    stopped_corpus = stop.build_stopped_inverted_index()
    stop_inv_index = stopped_corpus[0]
    stop_total_corpus = stopped_corpus[1]
    fa = FileAccess()
    r = Retriever()
    query_dict = fa.read_queries()
    relevance_data = fa.get_relevance_data()
    stopped_queries = stop.get_stopped_queries(query_dict)
    r.run_all_queries(inverted_index=stop_inv_index,
                      total_corpus=stop_total_corpus,
                      relevance_data=relevance_data,
                      query_dict=stopped_queries,
                      model=model,
                      task_id="3a",
                      notes="stopped",
                      store_queries='stopped')
Example #21
def get_clusters():
  html = '<html><body><h3>HN Cluster groups</h3>'
  r = Retriever()
  allwords, index, doc_to_title = r.retrieve()
  c = Clustering()
  root, cluster_doc_map = c.hcluster(allwords, index)
  relevant_clusters = c.subclusters(root, 0.90)
  singles = []
  for cluster in relevant_clusters:
    item_c = c.subcluster_items(cluster, cluster_doc_map, doc_to_title)
    if len(item_c) == 1: singles.append(item_c[0]); continue
    for item in item_c:
      html += '<a href="%s">%s</a><br>' % (doc_to_title[cluster_doc_map[item]][1],
                                            doc_to_title[cluster_doc_map[item]][0])
    html += '<hr><br><br>'
  html += '<h3>Single clusters</h3>'
  for item in singles:
    html += '<a href="%s">%s</a><br>' % (doc_to_title[cluster_doc_map[item]][1],
                                          doc_to_title[cluster_doc_map[item]][0])
  html += '</body></html>'
Example #22
def snippet_generation():
    r = Retriever()
    fa = FileAccess()
    query_dict = fa.read_queries()
    query_id = raw_input('Enter the query_id: \n')
    if int(query_id) > 64 or int(query_id) < 1:
        print 'No query exists, please enter a number between 1 and 64'
        return
    query = query_dict[int(query_id) - 1]
    print 'Query: ' + query
    fa = FileAccess()
    relevance_data = fa.get_relevance_data()
    corpus = r.get_corpus(True)
    inverted_index = corpus[0]
    total_corpus = corpus[1]

    results = r.run_all_queries(inverted_index=inverted_index,
                                total_corpus=total_corpus,
                                relevance_data=relevance_data,
                                query_dict=query_dict,
                                model='cosine',
                                task_id="1",
                                notes='',
                                ret=True)

    results = results[0:4]
    snippet_dictionary = {}

    for each in results:
        docid = each[1]
        data = total_corpus[docid]
        data = " ".join(data)
        sg = SnippetGenerator()
        snippet = sg.generate_snippet(data, query)
        snippet_dictionary[docid] = snippet

    print '\n'
    for each in results:
        print 'Doc-Id: ' + each[1]
        print snippet_dictionary[each[1]]
        print '\n'
Example #23
 def __init__(self,
              ret=Retriever(),
              track_name='Radioactive',
              alg_type='affprop'):
     self.ret = ret  # initialize the Retriever class
     self.dataset = self.ret.retrieve(track_name)  # get the data set
     self.data = normalize(self.dataset.data.tolist())  # normalize the data
     self.track_ids = self.dataset.labels.tolist()  # get the track ids
     self.reduced_data = self.reduce_data()  # reduce the data
     self.alg_type = alg_type  # get the algorithm type
     self.alg = self.run_clustering()  # setup the clustering
     self.clusters = self.alg.fit(self.reduced_data)  # cluster the data
Example #24
def task2(model):
    fa = FileAccess()
    r = Retriever()
    query_dict = fa.read_queries()
    corpus = r.get_corpus(True)
    inverted_index = corpus[0]
    total_corpus = corpus[1]
    relevance_data = fa.get_relevance_data()
    task1_folder = os.path.join(os.getcwd(), 'task1')
    file_name = "task1_" + model + "_.txt"
    result_file = task1_folder + '/' + file_name
    qe = QueryExpander(query_dict=query_dict, filename=result_file, clean=True)
    expanded_queries = qe.get_expanded_queries()
    r.run_all_queries(inverted_index=inverted_index,
                      total_corpus=total_corpus,
                      relevance_data=relevance_data,
                      query_dict=expanded_queries,
                      model='cosine',
                      task_id="2",
                      notes="expanded",
                      store_queries='expanded')
Example #25
 def __init__(self):
     self.visited_links = []
     self.links_queue = deque([])
     self.domain = ''
     self.same_domain = True
     self.pageRetriever = Retriever()
     self.downloader = Downloader()
     self.linkanalyser = linkanalyser.LinkAnalyzer()
     logging.basicConfig(
         filename = 'crawler.log', 
         format = '%(levelname)s:%(message)s', 
         level = logging.INFO
     )
Example #26
def get_clusters():
    html = '<html><body><h3>HN Cluster groups</h3>'
    r = Retriever()
    allwords, index, doc_to_title = r.retrieve()
    c = Clustering()
    root, cluster_doc_map = c.hcluster(allwords, index)
    relevant_clusters = c.subclusters(root, 0.90)
    singles = []
    for cluster in relevant_clusters:
        item_c = c.subcluster_items(cluster, cluster_doc_map, doc_to_title)
        if len(item_c) == 1:
            singles.append(item_c[0])
            continue
        for item in item_c:
            html += '<a href="%s">%s</a><br>' % (
                doc_to_title[cluster_doc_map[item]][1],
                doc_to_title[cluster_doc_map[item]][0])
        html += '<hr><br><br>'
    html += '<h3>Single clusters</h3>'
    for item in singles:
        html += '<a href="%s">%s</a><br>' % (doc_to_title[
            cluster_doc_map[item]][1], doc_to_title[cluster_doc_map[item]][0])
    html += '</body></html>'
Example #27
def getData():
    types = request.args.get('type')
    if types == '1':
        words = request.args.get('words')
        if len(words) == 0:
            response = initialPos
        else:
            pos = word_to_pos[words]
            pos = [pos_to_hanzi[each] for each in pos]
            response = pos
        return jsonify(response)
    else:
        choice_list = ['bert', 'word2vec', 'none']
        choice_index = int(request.args.get('choice'))
        choice = choice_list[choice_index]

        query_list = request.args.get('words').split(',')
        sen_list, ans_pos_list = se.query([each for each in query_list if len(each) > 0])
        pos_list = request.args.get('posList').split(',')
        pos_list = [hanzi_to_pos[each] for each in pos_list]
        relative_list = request.args.get('relative').split(',')
        relativeObject_list = request.args.get('relativeObject').split(',')
        re = Retriever(sen_list, query_list, pos_list_document=ans_pos_list, pos_list_query=pos_list, relative=relative_list, relativeObject=relativeObject_list)
        response = re.rank()
        if list(set(pos_list)) != ['all']:
            response = re.filter_by_pos()
        if list(set(relative_list)) != ['不限']:
            response = re.filter_relative()
        response = ["".join(each) for each in response]
        if choice == 'none':
            return jsonify(response[:50])
        elif choice == 'bert':
            new_sen = bertranker.rank(query_list, response[:50])
            return jsonify(new_sen)
        else:
            new_sen = word2vecranker.rank(query_list, response[:50])
            return jsonify(new_sen)
Example #28
    def on_get(self, req, resp):

        validate_params = True
        resp.status = falcon.HTTP_200
        if 'base_url' not in req.params:
            validate_params = False
        if 'category' not in req.params:
            validate_params = False

        num_books = 3

        if (validate_params is True):

            connect(database_name)
            books = Retriever.retrieve_books(req.params['base_url'],
                                             req.params['category'], num_books)
            Operations.insertBooks(books)

            resp.body = dumps("Inserted books in database")
            resp.status = falcon.HTTP_200
Example #29
def main():
    args = parser.parse_args()

    # Define all command-line mutable arguments
    thread_amount = int(args.threads) if args.threads else 2
    timestamp = int(args.start) if args.start else 1420070400000
    max_retries = int(args.retries) if args.retries else 5

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter('%(asctime)s - %(name)s: %(message)s')
    file_handler = logging.FileHandler('scraper.log')
    file_handler.setFormatter(formatter)

    logger.addHandler(file_handler)

    logger.debug('Started downloading items.')

    scraper = cfscrape.create_scraper()
    items = cached(scraper, '.data/items.json', 'https://rsbuddy.com/exchange/names.json')

    # Write caching file to .data folder
    write_file('.data/items.json', json.dumps(items))

    logger.debug('Finished downloading items.')

    # Divide the items into buckets for the retrievers to
    # process
    item_ids = list(items.keys())
    shuffle(item_ids)
    item_id_buckets = np.array_split(item_ids, thread_amount)

    logger.debug('Started downloading item history.')

    # Create all retrievers and run them as a separate
    # thread
    threads = []
    for item_id_bucket in item_id_buckets:
        thread = Retriever(item_id_bucket, timestamp, cfscrape.create_scraper(), max_retries, logger)
        thread.start()
        threads.append(thread)

    for thread in threads:
        thread.join()

    logger.debug('Scraper finished.')
Example #30
class TestRetrieverMethods(unittest.TestCase):

    def setUp(self):
        self.retriever = Retriever()
        self.output = []

        def fake_print(content):
            self.output.append(content)

        self.retriever.print = fake_print

        self.text_response = '{"success": true, "data": [209], "length": 1, "type": "uint8"}'
        self.json_response = {'type': 'uint8', 'data': [209], 'length': 1, 'success': True}

    def tearDown(self):
        self.output = None
        self.retriever = None

    def test_fetch(self):
        self.retriever.render = MagicMock()

        # Check empty url
        response = self.retriever.fetch("")
        self.assertIsNone(response)
        self.retriever.render.assert_not_called()

        # Check invalid url
        response = self.retriever.fetch("Invalidurl")
        self.assertIsNone(response.value)  # Still None, still loading
        response.join()  # Force the response to be retrieved
        self.assertIsInstance(response.value.exception, Exception)
        self.retriever.render.assert_not_called()

    def test_process_response(self):
        response = Mock()
        response.status_code = 200
        response.text = self.text_response

        processed_response = self.retriever.process_response(response)
        self.assertEqual(response, processed_response)
        self.assertEqual(len(self.output), 3)
        self.assertEqual(self.output[2], 209)  # The single value we have in our data

        self.output = []  # Reset output

        empty_response = Mock()
        empty_response.status_code = 400
        self.retriever.process_response(empty_response)
        self.assertEqual(len(self.output), 2)
        self.assertEqual(self.output[1], "Response not valid. Status Code 400")

    def test_render(self):
        self.retriever.render(self.json_response)
        self.assertEqual(len(self.output), 2)
        self.assertEqual(self.output[1], 209)  # The single value we have in our data

        self.output = []

        self.retriever.render(self.text_response)  # Bad response. Still a text
        self.assertEqual(len(self.output), 1)
        self.assertEqual(self.output[0], "Response not valid, please provide a valid API")
Example #31
 def __init__(self):
     shelve('database1', 'c')
     self.term_extractor = parser.ExtractTerms()
     self.retriever = Retriever()
Example #32
def main():
    retriever = Retriever()
    retriever.start()
Example #33
 def __init__(self):
     shelve("database1", "c")
     self.term_extractor = parser.ExtractTerms()
     self.retriever = Retriever()
Example #34
from flask import Flask, request
from retriever import Retriever
from flask import jsonify

app = Flask(__name__)
retriever = Retriever()


@app.route('/weather/data')
def data():
    lat = float(request.args.get('lat'))
    lon = float(request.args.get('lon'))
    return jsonify(retriever.get_data_for_location(lat, lon))


@app.route('/weather/summarize')
def summarize():
    lat = float(request.args.get('lat'))
    lon = float(request.args.get('lon'))
    return jsonify(retriever.get_summary_for_location(lat, lon))


if __name__ == '__main__':
    app.run()
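
A small client sketch for the two routes above. It is hedged: it assumes the app is running locally on Flask's default port 5000 and that the requests package is available; the coordinates are placeholders.

# Minimal client sketch for the /weather endpoints defined above.
import requests

base_url = 'http://127.0.0.1:5000'          # Flask's default host/port
params = {'lat': 52.52, 'lon': 13.41}       # placeholder coordinates

print(requests.get(base_url + '/weather/data', params=params).json())
print(requests.get(base_url + '/weather/summarize', params=params).json())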
Example #35
from parser import PlayParser
from retriever import Retriever
import sqlite3

print("Starting...")
retriever = Retriever()

def query(c, sql, args=[]):
    print(sql)
    c.execute(sql, args)

latest_files = retriever.latest_two_filenames()
latest_dates = [retriever.today, retriever.yesterday]

for (filename, date) in zip(latest_files, latest_dates):
    print ("Importing {0} for date {1}".format(filename, date.strftime("%Y%m%d")))
    parser = PlayParser(date, filename)
    plays = parser.parse()

    conn = sqlite3.connect('db/top40.db')

    for play in plays:
        c = conn.cursor()

        # find/insert artist
        query(c, 'select artist_id from artists where name=?', [play.artist])
        rows = c.fetchall()
        print(len(rows))
        if len(rows) < 1:
            query(c, 'insert into artists (name) values (?)', [play.artist])
            conn.commit()
Example #36
def openFile(fileName):
	try:
		fo = open(fileName, "r")
	except IOError:
		print "File", fileName, "cannot be read"
		sys.exit()
	return fo
	
# Main function from where program starts
if __name__ == '__main__':
	
	# Open the predefined problem domain related info file here
	fo = openFile('static_value.txt')	
	exec('content=' + fo.read())
	
	# Create the Retriever module, given case based memory file and 
	# problem domain related info file 
	rt = Retriever('generated_plan.txt', 'state_similarity.txt')
	
	# Create a current state and goal state to test our program
	
	# Create the current state here
	currentState = State('current')
	currentState.target = 'me'
	currentState.loc = {'me':'IU Campus', 'bus':'bus_stop_2'}
	currentState.cash = {'me':3050}	
	currentState.balance_bank = {'me':120}
	currentState.time = {'me':1820}
	currentState.nearest_bus_stop={'me': 'bus_stop_2'}
	# Put the domain related info into the state
	for stateItem in content:				
		exec("currentState."+stateItem)
	
Example #37
                          model_args.ff_embed_dim, model_args.num_heads,
                          model_args.dropout, model_args.enc_layers,
                          model_args.dec_layers, model_args.label_smoothing)
    elif model_args.arch == 'mem':
        model = MemGenerator(vocabs, model_args.embed_dim,
                             model_args.ff_embed_dim, model_args.num_heads,
                             model_args.dropout, model_args.mem_dropout,
                             model_args.enc_layers, model_args.dec_layers,
                             model_args.mem_enc_layers,
                             model_args.label_smoothing,
                             model_args.use_mem_score)
    elif model_args.arch == 'rg':
        retriever = Retriever.from_pretrained(
            model_args.num_retriever_heads,
            vocabs,
            args.index_path if args.index_path else model_args.retriever,
            model_args.nprobe,
            model_args.topk,
            args.device,
            use_response_encoder=(model_args.rebuild_every > 0))
        model = RetrieverGenerator(
            vocabs, retriever, model_args.share_encoder, model_args.embed_dim,
            model_args.ff_embed_dim, model_args.num_heads, model_args.dropout,
            model_args.mem_dropout, model_args.enc_layers,
            model_args.dec_layers, model_args.mem_enc_layers,
            model_args.label_smoothing)

        if args.hot_index is not None:
            model.retriever.drop_index()
            torch.cuda.empty_cache()
            model.retriever.update_index(args.hot_index, model_args.nprobe)
Example #38
    def __init__(self, args, wikidata):
        self.data_path = "data/{}/{}-{}.qa.json".format(
            args.data, args.data, args.data_type)
        self.tagme_path = "data/{}/{}-{}.tagme.json".format(
            args.data, args.data, args.data_type)

        self.wikidata = wikidata
        self.retriever = Retriever(args)

        with open(self.data_path, 'r') as f:
            orig_data = json.load(f)

        with open(self.tagme_path, 'r') as f:
            tagme_data = json.load(f)

        assert len(orig_data) == len(tagme_data)
        print("Loaded {} QA data".format(len(orig_data)))

        self.save_path = "data/{}/{}-{}.retrieved.json".format(
            args.data, args.data, args.data_type)

        #### data to save ###
        n_cross_relations = []
        n_inner_relations = []
        n_total_relations = []
        data_to_save = []

        N_TFIDF = 5 if args.data == "webquestions" else 10
        N_BM25 = 40 if args.data == "webquestions" else 80

        for i, (d, tags) in tqdm(enumerate(zip(orig_data, tagme_data))):
            if len(tags) > 0:
                sorted_tags = sorted(tags, key=lambda x: -x['score']
                                     ) if 'score' in tags[0] else tags.copy()
                tags = []
                for e in sorted_tags:
                    # for some reason, tagme keeps tagging "The Who" for "who" questions.
                    # we will exclude them.
                    if not (
                        (e['entity'] == 'The Who' and e['mention'] == 'who')
                            or e["entity"] == "song"):
                        if e['entity'] not in tags:
                            tags.append(e['entity'])

            tfidf_docs = self.retriever.get_titles_from_query(
                d['question'], N_TFIDF)
            for t in tfidf_docs:
                if t not in tags:
                    tags.append(t)
            keywords = self.wikidata.populate(tags,
                                              k=args.n_hops,
                                              use_aliases=False)
            collected_docs = set()
            collected_paragraphs = []
            paragraphs_to_run_bm25 = []
            for (doc_name, hop, relation) in keywords[:80]:
                if doc_name in collected_docs:
                    continue
                collected_docs.add(doc_name)
                contents = self.retriever.get_contents_from_title(
                    doc_name,
                    n_words=self.retriever.get_n_words(d['question'],
                                                       doc_name),
                    only_first=hop > 0)
                if len(contents) == 0:
                    continue
                collected_paragraphs.append((contents[0], hop, relation))
                assert hop == 0 or len(contents) == 1
                paragraphs_to_run_bm25 += [(content, relation)
                                           for content in contents[1:]]

            collected_paragraphs = [
                par
                for i, par in sorted(enumerate(collected_paragraphs),
                                     key=lambda x: (x[1][1], x[1][0][1], x[0]))
            ]
            bm25_paragraphs = self.retriever.get_paragraphs_from_documents(
                d['question'],
                paragraphs_to_run_bm25,
                N_BM25,
                only_first=False,
                is_tuple=True)
            pars = [(par, rel) for par, hop, rel in collected_paragraphs
                    if hop == 0]
            pars_1 = [(par, rel) for par, hop, rel in collected_paragraphs
                      if hop == 1]
            for p_i in range(len(bm25_paragraphs)):
                if len(pars_1) > p_i:
                    pars.append(pars_1[p_i])
                pars.append(bm25_paragraphs[p_i])
            pars += self.retriever.get_paragraphs_from_documents(
                d['question'],
                pars_1[len(bm25_paragraphs):],
                100,
                only_first=False,
                is_tuple=True)
            pars += self.retriever.get_paragraphs_from_documents(
                d['question'],
                [(par, rel)
                 for par, hop, rel in collected_paragraphs if hop > 1],
                100,
                only_first=False,
                is_tuple=True)
            # truncate pars to be 100 at maximum
            pars = pars[:100]

            relations = [p[1] for p in pars]
            pars = [p[0] for p in pars]

            # get graph information for the GraphReader
            collected_docs = set([par[0] for par in pars])
            graph = self.wikidata.get_graph(collected_docs)
            constructed_graph = {}
            n_cross, n_inner = 0, 0
            for i1, (title1, index1, _) in enumerate(pars):
                for i2, (title2, index2, _) in enumerate(pars):
                    if i1 == i2: continue
                    if (title1, title2) in graph and index1 == index2 == 0:
                        constructed_graph[(i1, i2)] = graph[(title1, title2)]
                        n_cross += 1
                    if title1 == title2 and index1 == 0 and index2 > 0:
                        constructed_graph[(i1, i2)] = ["<CHILD_PARAGRAPH>"]
                        constructed_graph[(i2, i1)] = ["<PARENT_PARAGRAPH>"]
                        n_inner += 2
            n_cross_relations.append(n_cross)
            n_inner_relations.append(n_inner)
            n_total_relations.append(n_cross + n_inner)
            data_to_save.append(
                json.dumps({
                    'question': d['question'],
                    'answers': d['answers'],
                    'paragraphs': pars,
                    'graph': {
                        '{} {}'.format(k[0], k[1]): v
                        for k, v in constructed_graph.items()
                    }
                }))

        print("Cross", np.mean(n_cross_relations))
        print("Inner", np.mean(n_inner_relations))
        print("Total", np.mean(n_total_relations))

        with open(self.save_path, 'w') as f:
            f.write("\n".join(data_to_save))
Example #39
class Eris:
    def __init__(self):
        self.output = os.path.join(config.logsDir, "output.log")

    def start(self):
        try:
            with open(config.statusFile, "r") as f:
                if f.read() == config.STATUS_LINE:
                    return
        except:
            pass
        try:
            with open(config.statusFile, "w") as f:
                f.write(config.STATUS_LINE)
        except IOError as e:
            print >>sys.stderr, e
            return

        if self.daemonize():
            return

        self.startTime = datetime.now()
        self.storage = Storage()
        self.btserver = BtServer(self.storage)
        self.retriever = Retriever(self.storage)
        self.btserver.start()
        self.retriever.start()

        daemon = Pyro4.Daemon(port=config.pyroPort)
        uri = daemon.register(self, config.PNAME)
        log.info("Eris daemon URI: [{}]".format(uri))
        self.running = True
        daemon.requestLoop(loopCondition=lambda: self.running)

        daemon.unregister(config.PNAME)
        daemon.close()

    def stop(self):
        log.info("Closing eris")
        try:
            self.btserver.kill()
            self.retriever.kill()
            self.retriever.join(1.0)
            self.btserver.join(1.0)

            with open(config.statusFile, "w"):
                pass
        except:
            log.exception("Something went wrong while shutting down")
        self.running = False

    def status(self):
        pid = os.getpid()
        proc = psutil.Process(pid)
        cpu = proc.get_cpu_percent()
        mem = proc.get_memory_percent()
        uptime = datetime.now() - self.startTime
        du = self.storage.size()
        return (pid, cpu, mem, uptime, du)

    def put(self, packets):
        self.storage.put(packets)

    def get(self, since=0, to=0, limit=0):
        connId, _ = self.storage.get(since, to, limit)
        return self.storage.fetchall(connId)

    def count(self):
        return self.storage.rowcount()

    def ping(self):
        return config.PNAME

    def daemonize(self):
        try:
            pid = os.fork()
            if pid > 0:
                return True
        except OSError as e:
            sys.stderr.write("Fork #1 failed: {} ({})\n".format(e.errno, e.strerror))
            sys.exit(1)

        os.chdir("/")
        os.setsid()

        try:
            pid = os.fork()
            if pid > 0:
                sys.exit(0)
        except OSError as e:
            sys.stderr.write("Fork #2 failed: {} ({})\n".format(e.errno, e.strerror))
            sys.exit(1)

        sys.stdout.flush()
        sys.stderr.flush()
        with open(self.output, "w"):
            pass
        out = file(self.output, "a+", 1)
        os.dup2(out.fileno(), sys.stdout.fileno())
        os.dup2(out.fileno(), sys.stderr.fileno())
        return False

    @staticmethod
    def getProxy():
        uri = "PYRO:{}@localhost:{}".format(config.PNAME, config.pyroPort)
        return Pyro4.Proxy(uri)
Example #40
class Crawler(object):
    """crawler goes out to the web and downloads the web pages
    """
    _invalidExt = [
        '.pdf', '.jpg', '.jpeg', '.doc', '.docx',
        '.gif', '.zip', '.rar', '.PDF'
    ]
    def __init__(self):
        self.visited_links = []
        self.links_queue = deque([])
        self.domain = ''
        self.same_domain = True
        self.pageRetriever = Retriever()
        self.downloader = Downloader()
        self.linkanalyser = linkanalyser.LinkAnalyzer()
        logging.basicConfig(
            filename = 'crawler.log', 
            format = '%(levelname)s:%(message)s', 
            level = logging.INFO
        )

         
            
    def crawlPage(self, url, same_domain = True):
        
        retrieverResponse = self.downloader.CDownload(url)
        
        if retrieverResponse == 0:
            print retrieverResponse, "Invalid Url.....parsing skipped\n"
            return
        
        self.visited_links.append(url)
        
        try:
            links = self.pageRetriever.getLinks(url)
            self.linkanalyser.analyze(url, links)
        except Exception:
            return
            
        for link in links:
            if link not in self.visited_links:
                if same_domain == True:
                    if urlparse(link)[1] != self.domain:
                        #print link, " *** discarded for crawl .. not in domain"
                        logging.info("%s * discarded for crawl .. not in domain"%link)
                    else:
                        if link not in self.links_queue:
                            if splitext(link)[1] not in self._invalidExt:
                                self.links_queue.append(link)
                                #print link, " *** new link added to crawl queue"
                                logging.info("%s * new link added to crawl queue"%link)
                        else:
                            #print link,"*** discarded already visited"
                            logging.info("%s * discarded already visited"%link)
                    
                if same_domain == False:
                    if link not in self.links_queue:
                        self.links_queue.append(link)
                        #print link," *** new link added to crawl queue"
                        logging.info("%s * new link added to crawl queue"%link)
                    else:
                        #print link,"*** discarded already visited"
                        logging.info("%s *** discarded already visited"%link)
                      
        print "length of queue is ", len(self.links_queue), "len of visited queue is ", \
            len(self.visited_links)
        logging.info("length of queue is %d   length of visited queue is %d"\
            %(len(self.links_queue), len(self.visited_links)))            
                    
                    
    def start_crawl(self, url, same_domain = True):
        self.links_queue.append(url)
        self.domain = urlparse(url)[1] 
        self.same_domain = same_domain
        # process links in queue
        while self.links_queue:
            url = self.links_queue.popleft()
            self.crawlPage(url)
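
A minimal usage sketch for the Crawler above. It is hedged: the seed URL is a placeholder, and the crawl depends on the Downloader, Retriever, and LinkAnalyzer classes from the earlier examples.

# Minimal usage sketch: crawl within a single domain, starting from a seed URL.
crawler = Crawler()
crawler.start_crawl('http://example.com/', same_domain=True)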