def start(self):
    ### IMPORTS
    self.retriever = Retriever()
    self.time = __import__('time')
    self.json = __import__('json')
    self.re = __import__('re')

    ### INTRO ART
    rows, columns = os.popen('stty size', 'r').read().split()
    asci = ""
    f = open('assets/logo.txt', 'r')
    logoArt = f.read()
    f.close()
    for line in logoArt.split('\n'):
        #asci=asci+'\n'+' '*(int(0.5*int(columns))-int(0.5*len(line)))+line
        asci = asci + '\n' + Fore.YELLOW + ' ' * 24 + line
    asci += '\n'
    asci += '\n' + ' ' * (int(0.5 * int(columns)) - int(0.5 * len('InstaMiner'))) + 'InstaMiner'
    asci += '\n' + ' ' * (int(0.5 * int(columns)) - int(0.5 * len('v' + self.v))) + 'v' + self.v
    asci += '\n\n' + ' ' * (int(0.5 * int(columns)) - int(
        0.5 * len('Type "help" for help'))) + 'Type "help" for help'
    self.intro = asci

    ### INIT
    self._buffer_start()
    self.cmdloop()
def main(args, local_rank=0):
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    vocabs = dict()
    vocabs['src'] = Vocab(args.src_vocab, 0, [BOS, EOS])
    vocabs['tgt'] = Vocab(args.tgt_vocab, 0, [BOS, EOS])
    logger.info(args)
    for name in vocabs:
        logger.info("vocab %s, size %d, coverage %.3f",
                    name, vocabs[name].size, vocabs[name].coverage)
    set_seed(19940117)
    #device = torch.device('cpu')
    torch.cuda.set_device(local_rank)
    device = torch.device('cuda', local_rank)

    logger.info("start building model")
    logger.info("building retriever")
    if args.add_retrieval_loss:
        retriever, another_model = Retriever.from_pretrained(
            args.num_retriever_heads, vocabs, args.retriever, args.nprobe,
            args.topk, local_rank, load_response_encoder=True)
        matchingmodel = MatchingModel(retriever.model, another_model)
        matchingmodel = matchingmodel.to(device)
    else:
        retriever = Retriever.from_pretrained(args.num_retriever_heads, vocabs,
                                              args.retriever, args.nprobe,
                                              args.topk, local_rank)

    logger.info("building retriever + generator")
    model = RetrieverGenerator(vocabs, retriever, args.share_encoder,
                               args.embed_dim, args.ff_embed_dim, args.num_heads,
                               args.dropout, args.mem_dropout,
                               args.enc_layers, args.dec_layers,
                               args.mem_enc_layers, args.label_smoothing)

    model = model.to(device)
    model.eval()
    dev_data = DataLoader(vocabs, args.dev_data, args.dev_batch_size, for_train=False)
    bleu = validate(device, model, dev_data, beam_size=5, alpha=0.6, max_time_step=10)
def phase2(model):
    stop = Stopper()
    stopped_corpus = stop.build_stopped_inverted_index()
    stop_inv_index = stopped_corpus[0]
    stop_total_corpus = stopped_corpus[1]
    task3a_folder = os.path.join(os.getcwd(), 'task3a')
    file_name = "task3a_cosine_stopped.txt"
    r = Retriever()
    fa = FileAccess()
    relevance_data = fa.get_relevance_data()
    query_dict = fa.read_queries()
    result_file = task3a_folder + '/' + file_name
    stopped_queries = stop.get_stopped_queries(query_dict)
    qe = QueryExpander(query_dict=stopped_queries, filename=result_file, clean=False)
    expanded_stopped_queries = qe.get_expanded_queries()
    r.run_all_queries(inverted_index=stop_inv_index,
                      total_corpus=stop_total_corpus,
                      relevance_data=relevance_data,
                      query_dict=expanded_stopped_queries,
                      model=model,
                      task_id="phase2",
                      notes="stopped_expanded",
                      store_queries='stopped_expanded')
def result_handler(): """ Post handler for request data Adds request data into sqlite database First pulls data in database, but if request ingredients are not in database ingredients, pulls from request URL and adds to database """ desired_handlers = ['allrecipes', 'foodnetwork', 'epicurious'] user_ingredients = [request.form[task] for task in request.form] ingredients = [] for i in sorted(user_ingredients): a = i.lower() j = a.replace(' ', '') ingredients.append(j) exists = DBSession.query(Recipes).get(str(ingredients)) if exists: ar_recipe_urls = ast.literal_eval(exists.ar_recipe_url) ar_recipe_names = ast.literal_eval(exists.ar_recipe_name) fn_recipe_urls = ast.literal_eval(exists.fn_recipe_url) fn_recipe_names = ast.literal_eval(exists.fn_recipe_name) ep_recipe_urls = ast.literal_eval(exists.ep_recipe_url) ep_recipe_names = ast.literal_eval(exists.ep_recipe_name) else: ar_recipe_links = list( Retriever(ingredients)(desired_handlers)[0]['allrecipes']) ar_recipe_urls = ar_recipe_links[0] ar_recipe_names = ar_recipe_links[1] fn_recipe_links = list( Retriever(ingredients)(desired_handlers)[0]['foodnetwork']) fn_recipe_urls = fn_recipe_links[0] fn_recipe_names = fn_recipe_links[1] ep_recipe_links = list( Retriever(ingredients)(desired_handlers)[0]['epicurious']) ep_recipe_urls = ep_recipe_links[0] ep_recipe_names = ep_recipe_links[1] ingredient_add = Recipes(ingredients=str(ingredients), ar_recipe_url=str(ar_recipe_urls), ar_recipe_name=str(ar_recipe_names), fn_recipe_url=str(fn_recipe_urls), fn_recipe_name=str(fn_recipe_names), ep_recipe_url=str(ep_recipe_urls), ep_recipe_name=str(ep_recipe_names)) DBSession.add(ingredient_add) DBSession.commit() return ar_recipe_urls, ar_recipe_names, fn_recipe_urls, fn_recipe_names, ep_recipe_urls, ep_recipe_names
class LinkAnalyzer(object): """creates abstract documents and feeds their attributes """ def __init__(self): shelve('database1', 'c') self.term_extractor = parser.ExtractTerms() self.retriever = Retriever() def analyze(self, url, links): """creates a document and sets its outgoing links """ self.db = shelve('database1', 'w') key = md5(url).hexdigest() #if the document is already in the database, just add its outgoing links if key in self.db.iterkeys(): doc = self.db[key] doc.insertOL(links) doc.url = url document = open(self.retriever.filename(url)).read() doc.all_terms = self.term_extractor.get_terms(document) unique_terms = self.term_extractor.get_unique_terms(document) doc.unique_terms_freq = self.term_extractor.count_term_frequencies( unique_terms, document) #print self.db[key].outgoingLinks #if there is no document for the url, create a document and add its outgoing links if key not in self.db.iterkeys(): newDoc = Document(url) newDoc.insertOL(links) newDoc.url = url document = open(self.retriever.filename(url)).read() newDoc.all_terms = self.term_extractor.get_terms(document) unique_terms = self.term_extractor.get_unique_terms(document) newDoc.unique_terms_freq = self.term_extractor.count_term_frequencies( unique_terms, document) self.db[key] = newDoc #print self.db[key].outgoingLinks #self.extractLinksfromResponse(url,links) self.db.close() def extractLinksfromResponse(self, url, links): """analyses the incoming links from the response """ for link in links: key = md5(link).hexdigest() if key in self.db.iterkeys(): doc = self.db[key] doc.insertIL(url) else: newDo = Document(link) newDo.insertIL(url) #print type(newDo) #print type(key) self.db[key] = newDo
class LinkAnalyzer(object): """creates abstract documents and feeds their attributes """ def __init__(self): shelve("database1", "c") self.term_extractor = parser.ExtractTerms() self.retriever = Retriever() def analyze(self, url, links): """creates a document and sets its outgoing links """ self.db = shelve("database1", "w") key = md5(url).hexdigest() # if the document is already in the database, just add its outgoing links if key in self.db.iterkeys(): doc = self.db[key] doc.insertOL(links) doc.url = url document = open(self.retriever.filename(url)).read() doc.all_terms = self.term_extractor.get_terms(document) unique_terms = self.term_extractor.get_unique_terms(document) doc.unique_terms_freq = self.term_extractor.count_term_frequencies(unique_terms, document) # print self.db[key].outgoingLinks # if there is no document for the url, create a document and add its outgoing links if key not in self.db.iterkeys(): newDoc = Document(url) newDoc.insertOL(links) newDoc.url = url document = open(self.retriever.filename(url)).read() newDoc.all_terms = self.term_extractor.get_terms(document) unique_terms = self.term_extractor.get_unique_terms(document) newDoc.unique_terms_freq = self.term_extractor.count_term_frequencies(unique_terms, document) self.db[key] = newDoc # print self.db[key].outgoingLinks # self.extractLinksfromResponse(url,links) self.db.close() def extractLinksfromResponse(self, url, links): """analyses the incoming links from the response """ for link in links: key = md5(link).hexdigest() if key in self.db.iterkeys(): doc = self.db[key] doc.insertIL(url) else: newDo = Document(link) newDo.insertIL(url) # print type(newDo) # print type(key) self.db[key] = newDo
def __init__(self, filename, query_dict, top_k=12, n=5, clean=True):
    r = Retriever()
    if not clean:
        self.total_corpus = r.get_total_corpus(folder='stopped')
    else:
        self.total_corpus = r.get_total_corpus(folder='clean')
    self.k = top_k
    self.n = n
    fa = FileAccess()
    self.query_dict = query_dict
    self.results = fa.read_result_file(filename=filename)
def retrieve_remote_video(self, records):
    """
    Retrieve video from remote server.
    """
    for key, val in records.items():
        torrent_path = (self.DEFAULT_REMOTE_VIDEO_DIR + val).encode('utf-8')
        #existence = exists(torrent_path, use_sudo=False, verbose=True)
        #if not existence:
        #    print (red('Downloaded video not exists on path - %s' % torrent_path))
        #    continue
        retriever = Retriever(torrent_path)
        retriever.start()
        self.stop_seeding(key)
class Downloader(object): """There are two downloaders download() uses the urllib2 module CDownload() uses curl to download the pages. This results in fast page downloads """ def __init__(self): self.retriever = Retriever() #to use the filename function self.headers = { 'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)', 'Accept' : 'text/xml,application/xml,application/xhtml+xml,\ text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5', 'Accept-Language' : 'fr-fr,en-us;q=0.7,en;q=0.3', 'Accept-Charset' : 'ISO-8859-1,utf-8;q=0.7,*;q=0.7' } def download(self,url): """downloads the webpage indicated by url and saves it in a file with an absolute path as that of the url """ reqObj = urllib2.Request(url, None, self.headers) try: urlObj = urllib2.urlopen(reqObj) response = urlObj.readlines() except Exception: return #write the content of the response object to the file file = open(self.retriever.filename(url), 'w') for line in response: file.writelines(line) print url + "**** crawled" logging.info("* crawled %s \n"%url) file.close() return 1 def CDownload(self, url): try: file_name = self.retriever.filename(url) #curl downloads the file and writes it into a file os.system("curl %s -o %s"%(url, file_name)) print url + "**** crawled" logging.info("* crawled %s \n"%url) response=open(file_name, "r").read() except IOError: return 0 return response
def __init__(self, config, config_key):
    self.config = config
    self.config_key = config_key

    # For Understanding User Inputs
    self.input_yes_or_no = json.load(
        open(config[config_key]['input_yes_or_no']))
    self.input_about = json.load(open(config[config_key]['input_about']))
    self.location_file_path = config[config_key]['location']
    self.cuisine_file_path = config[config_key]['cuisine']
    self.food_file_path = config[config_key]['food']

    # For Generating Responses
    self.response_greetings = json.load(
        open(config[config_key]['response_greetings']))
    self.response_yes_no = json.load(
        open(config[config_key]['response_yes_no']))
    self.response_about = json.load(
        open(config[config_key]['response_about']))
    self.response_general = json.load(
        open(config[config_key]['response_general']))
    self.response_for_business = json.load(
        open(config[config_key]['response_for_business']))

    # Helpers
    self.retriever = Retriever(config, config_key)

    # read in valid locations
    with open(self.location_file_path, 'r') as location_file:
        self.valid_locations = [
            location.lower()
            for location in location_file.read().splitlines()
        ]

    # read in cuisines
    with open(self.cuisine_file_path, 'r') as cuisine_file:
        self.known_cuisines = [
            cuisine.lower()
            for cuisine in cuisine_file.read().splitlines()
        ]

    # read in food
    with open(self.food_file_path, 'r') as food_file:
        self.known_foods = [
            food.lower() for food in food_file.read().splitlines()
        ]

    self.state_after_response = State.understood_nothing
def main():
    '''Things to do when this module is called'''
    r = Retriever()
    c = raw_input('Enter a search term (q to quit): ')
    ask = True
    while not should_terminate(c):
        i = 0
        ask = True
        print('\n############ RESULTS ############\n')
        link_pq = get_links(c, r)
        for tuple in link_pq:
            link = tuple[1]
            if print_url(link):
                i += 1
            if i != 0 and i % 10 == 0:
                print('#################################\n')
                print("Press Enter to list more results.")
                c = raw_input('Enter a search term (q to quit): ')
                if c != '':
                    ask = False
                    break
                print('\n#################################\n')
        if ask:
            print('#################################\n')
            c = raw_input('Enter a search term (q to quit): ')
    print('\n############\n# Goodbye! #\n############\n')
def load_retriever(args, device, task_tokenizer, retriever_tokenizer,
                   finetuned_path=None, stored_index=None, train_use_idx=None):
    print(
        f"\nLoading retriever: {finetuned_path if finetuned_path is not None else args.retrieval_model}\n"
    )
    config = AutoConfig.from_pretrained(args.retrieval_model)
    config.__dict__.update(args.__dict__)
    model = Retriever.from_pretrained(args.retrieval_model,
                                      config=config,
                                      cache_dir=args.cache_dir,
                                      task_tokenizer=task_tokenizer,
                                      retriever_tokenizer=retriever_tokenizer,
                                      stored_index=stored_index,
                                      train_use_idx=train_use_idx)
    model.resize_token_embeddings(len(retriever_tokenizer))
    if args.reinitialize_retriever and finetuned_path is None:
        model.init_weights()
    if finetuned_path is not None:
        model_state_dict = torch.load(
            finetuned_path, map_location=lambda storage, loc: storage
        )  # args for preventing memory leakage across gpus
        utils.rectify_mismatched_embeddings(model, model_state_dict,
                                            retriever_tokenizer)
        model.load_state_dict(model_state_dict)
    model = model.to(device)
    return model
def get_model_and_tokenizer(args):
    model_config = UnilmConfig.from_pretrained(
        args.config_name if args.config_name else 'unilm-base-cased',
        cache_dir=args.cache_dir if args.cache_dir else None)
    config = BertForSeq2SeqConfig.from_exist_config(
        config=model_config,
        label_smoothing=args.label_smoothing,
        max_position_embeddings=args.max_source_seq_length + args.max_target_seq_length)
    logger.info("Model config for seq2seq: %s", str(config))

    tokenizer = UnilmTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else 'unilm-base-cased',
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)

    generator = BertForSequenceToSequence.from_pretrained(
        'unilm-base-cased',
        config=config,
        model_type='unilm',
        reuse_position_embedding=True,
        cache_dir=args.cache_dir if args.cache_dir else None)
    generator.to(args.device)

    classifier = Classifier(config.hidden_size, args.num_labels)
    classifier.to(args.device)

    logger.info("Initialize retriever.")
    retriever = Retriever(args, tokenizer)

    return generator, classifier, tokenizer, retriever
def task3b(model):
    stem = Stemmer()
    r = Retriever()
    stem_total_corpus = stem.build_stemmed_data()
    stem_inv_index = stem.build_stemmed_index()
    fa = FileAccess()
    relevance_data = fa.get_relevance_data()
    stemmed_queries = fa.get_stem_queries()
    r.run_all_queries(inverted_index=stem_inv_index,
                      total_corpus=stem_total_corpus,
                      relevance_data=relevance_data,
                      query_dict=stemmed_queries,
                      model=model,
                      task_id="3b",
                      notes="stemmed",
                      store_queries='stemmed')
def task1(notes=''):
    r = Retriever()
    fa = FileAccess()
    query_dict = fa.read_queries()
    corpus = r.get_corpus(True)
    inverted_index = corpus[0]
    total_corpus = corpus[1]
    relevance_data = fa.get_relevance_data()
    for model in models:
        r.run_all_queries(inverted_index=inverted_index,
                          total_corpus=total_corpus,
                          relevance_data=relevance_data,
                          query_dict=query_dict,
                          model=model,
                          task_id="1",
                          notes=notes)
def task3a(model):
    stop = Stopper()
    stopped_corpus = stop.build_stopped_inverted_index()
    stop_inv_index = stopped_corpus[0]
    stop_total_corpus = stopped_corpus[1]
    fa = FileAccess()
    r = Retriever()
    query_dict = fa.read_queries()
    relevance_data = fa.get_relevance_data()
    stopped_queries = stop.get_stopped_queries(query_dict)
    r.run_all_queries(inverted_index=stop_inv_index,
                      total_corpus=stop_total_corpus,
                      relevance_data=relevance_data,
                      query_dict=stopped_queries,
                      model=model,
                      task_id="3a",
                      notes="stopped",
                      store_queries='stopped')
def get_clusters():
    html = '<html><body><h3>HN Cluster groups</h3>'
    r = Retriever()
    allwords, index, doc_to_title = r.retrieve()
    c = Clustering()
    root, cluster_doc_map = c.hcluster(allwords, index)
    relevant_clusters = c.subclusters(root, 0.90)
    singles = []
    for cluster in relevant_clusters:
        item_c = c.subcluster_items(cluster, cluster_doc_map, doc_to_title)
        if len(item_c) == 1:
            singles.append(item_c[0])
            continue
        for item in item_c:
            html += '<a href="%s">%s</a><br>' % (
                doc_to_title[cluster_doc_map[item]][1],
                doc_to_title[cluster_doc_map[item]][0])
        html += '<hr><br><br>'
    html += '<h3>Single clusters</h3>'
    for item in singles:
        html += '<a href="%s">%s</a><br>' % (
            doc_to_title[cluster_doc_map[item]][1],
            doc_to_title[cluster_doc_map[item]][0])
    html += '</body></html>'
def snippet_generation():
    r = Retriever()
    fa = FileAccess()
    query_dict = fa.read_queries()
    query_id = raw_input('Enter the query_id: \n')
    if int(query_id) > 64 or int(query_id) < 1:
        print 'No Query exists, please enter between 1 to 64'
        return
    query = query_dict[int(query_id) - 1]
    print 'Query: ' + query
    fa = FileAccess()
    relevance_data = fa.get_relevance_data()
    corpus = r.get_corpus(True)
    inverted_index = corpus[0]
    total_corpus = corpus[1]
    results = r.run_all_queries(inverted_index=inverted_index,
                                total_corpus=total_corpus,
                                relevance_data=relevance_data,
                                query_dict=query_dict,
                                model='cosine',
                                task_id="1",
                                notes='',
                                ret=True)
    results = results[0:4]
    snippet_dictionary = {}
    for each in results:
        docid = each[1]
        data = total_corpus[docid]
        data = " ".join(data)
        sg = SnippetGenerator()
        snippet = sg.generate_snippet(data, query)
        snippet_dictionary[docid] = snippet
    print '\n'
    for each in results:
        print 'Doc-Id: ' + each[1]
        print snippet_dictionary[each[1]]
        print '\n'
def __init__(self, ret=None, track_name='Radioactive', alg_type='affprop'):
    # avoid a shared default instance: build a Retriever per object unless one is passed in
    self.ret = ret if ret is not None else Retriever()
    self.dataset = self.ret.retrieve(track_name)       # get the data set
    self.data = normalize(self.dataset.data.tolist())  # normalize the data
    self.track_ids = self.dataset.labels.tolist()      # get the track ids
    self.reduced_data = self.reduce_data()             # reduce the data
    self.alg_type = alg_type                           # get the algorithm type
    self.alg = self.run_clustering()                   # set up the clustering
    self.clusters = self.alg.fit(self.reduced_data)    # cluster the data
def task2(model):
    fa = FileAccess()
    r = Retriever()
    query_dict = fa.read_queries()
    corpus = r.get_corpus(True)
    inverted_index = corpus[0]
    total_corpus = corpus[1]
    relevance_data = fa.get_relevance_data()
    task1_folder = os.path.join(os.getcwd(), 'task1')
    file_name = "task1_" + model + "_.txt"
    result_file = task1_folder + '/' + file_name
    qe = QueryExpander(query_dict=query_dict, filename=result_file, clean=True)
    expanded_queries = qe.get_expanded_queries()
    r.run_all_queries(inverted_index=inverted_index,
                      total_corpus=total_corpus,
                      relevance_data=relevance_data,
                      query_dict=expanded_queries,
                      model='cosine',
                      task_id="2",
                      notes="expanded",
                      store_queries='expanded')
def getData():
    types = request.args.get('type')
    if types == '1':
        words = request.args.get('words')
        if len(words) == 0:
            response = initialPos
        else:
            pos = word_to_pos[words]
            pos = [pos_to_hanzi[each] for each in pos]
            response = pos
        return jsonify(response)
    else:
        choice_list = ['bert', 'word2vec', 'none']
        choice_index = int(request.args.get('choice'))
        choice = choice_list[choice_index]
        query_list = request.args.get('words').split(',')
        sen_list, ans_pos_list = se.query(
            [each for each in query_list if len(each) > 0])
        pos_list = request.args.get('posList').split(',')
        pos_list = [hanzi_to_pos[each] for each in pos_list]
        relative_list = request.args.get('relative').split(',')
        relativeObject_list = request.args.get('relativeObject').split(',')
        retriever = Retriever(sen_list,
                              query_list,
                              pos_list_document=ans_pos_list,
                              pos_list_query=pos_list,
                              relative=relative_list,
                              relativeObject=relativeObject_list)
        response = retriever.rank()
        if list(set(pos_list)) != ['all']:
            response = retriever.filter_by_pos()
        if list(set(relative_list)) != ['不限']:
            response = retriever.filter_relative()
        response = ["".join(each) for each in response]
        if choice == 'none':
            return jsonify(response[:50])
        elif choice == 'bert':
            new_sen = bertranker.rank(query_list, response[:50])
            return jsonify(new_sen)
        else:
            new_sen = word2vecranker.rank(query_list, response[:50])
            return jsonify(new_sen)
def on_get(self, req, resp):
    validate_params = True
    resp.status = falcon.HTTP_200
    if 'base_url' not in req.params:
        validate_params = False
    if 'category' not in req.params:
        validate_params = False
    num_books = 3
    if validate_params:
        connect(database_name)
        books = Retriever.retrieve_books(req.params['base_url'],
                                         req.params['category'], num_books)
        Operations.insertBooks(books)
        resp.body = dumps("Inserted books in database")
        resp.status = falcon.HTTP_200
def main():
    args = parser.parse_args()

    # Define all command-line mutable arguments
    thread_amount = int(args.threads) if args.threads else 2
    timestamp = int(args.start) if args.start else 1420070400000
    max_retries = int(args.retries) if args.retries else 5

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(name)s: %(message)s')
    file_handler = logging.FileHandler('scraper.log')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    logger.debug('Started downloading items.')
    scraper = cfscrape.create_scraper()
    items = cached(scraper, '.data/items.json',
                   'https://rsbuddy.com/exchange/names.json')

    # Write caching file to .data folder
    write_file('.data/items.json', json.dumps(items))
    logger.debug('Finished downloading items.')

    # Divide the items into buckets for the retrievers to process
    item_ids = list(items.keys())
    shuffle(item_ids)
    item_id_buckets = np.array_split(item_ids, thread_amount)

    logger.debug('Started downloading item history.')

    # Create all retrievers and run each as a separate thread
    threads = []
    for item_id_bucket in item_id_buckets:
        thread = Retriever(item_id_bucket, timestamp,
                           cfscrape.create_scraper(), max_retries, logger)
        thread.start()
        threads.append(thread)

    for thread in threads:
        thread.join()

    logger.debug('Scraper finished.')
class TestRetrieverMethods(unittest.TestCase):

    def setUp(self):
        self.retriever = Retriever()
        self.output = []

        def fake_print(content):
            self.output.append(content)

        self.retriever.print = fake_print
        self.text_response = '{"success": true, "data": [209], "length": 1, "type": "uint8"}'
        self.json_response = {'type': 'uint8', 'data': [209], 'length': 1, 'success': True}

    def tearDown(self):
        self.output = None
        self.retriever = None

    def test_fetch(self):
        self.retriever.render = MagicMock()

        # Check empty url
        response = self.retriever.fetch("")
        self.assertIsNone(response)
        self.retriever.render.assert_not_called()

        # Check invalid url
        response = self.retriever.fetch("Invalidurl")
        self.assertIsNone(response.value)  # Still None, still loading
        response.join()  # Force the response to be retrieved
        self.assertIsInstance(response.value.exception, Exception)
        self.retriever.render.assert_not_called()

    def test_process_response(self):
        response = Mock()
        response.status_code = 200
        response.text = self.text_response
        processed_response = self.retriever.process_response(response)
        self.assertEqual(response, processed_response)
        self.assertEqual(len(self.output), 3)
        self.assertEqual(self.output[2], 209)  # The single value we have in our data

        self.output = []  # Reset output
        empty_response = Mock()
        empty_response.status_code = 400
        self.retriever.process_response(empty_response)
        self.assertEqual(len(self.output), 2)
        self.assertEqual(self.output[1], "Response not valid. Status Code 400")

    def test_render(self):
        self.retriever.render(self.json_response)
        self.assertEqual(len(self.output), 2)
        self.assertEqual(self.output[1], 209)  # The single value we have in our data

        self.output = []
        self.retriever.render(self.text_response)  # Bad response. Still a text
        self.assertEqual(len(self.output), 1)
        self.assertEqual(self.output[0], "Response not valid, please provide a valid API")
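# A minimal runner sketch for the suite above, not part of the original file:
# it assumes the class lives in a module that already imports unittest,
# Retriever, and Mock/MagicMock from unittest.mock as shown.
if __name__ == '__main__':
    unittest.main()  # discovers and runs TestRetrieverMethods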
def main():
    retriever = Retriever()
    retriever.start()
def __init__(self): shelve("database1", "c") self.term_extractor = parser.ExtractTerms() self.retriever = Retriever()
from flask import Flask, jsonify, request

from retriever import Retriever

app = Flask(__name__)
retriever = Retriever()


@app.route('/weather/data')
def data():
    lat = float(request.args.get('lat'))
    lon = float(request.args.get('lon'))
    return jsonify(retriever.get_data_for_location(lat, lon))


@app.route('/weather/summarize')
def summarize():
    lat = float(request.args.get('lat'))
    lon = float(request.args.get('lon'))
    return jsonify(retriever.get_summary_for_location(lat, lon))


if __name__ == '__main__':
    app.run()
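# A hedged client-side sketch for the two endpoints above, not part of the
# original app: it assumes the app is running locally on Flask's default
# port 5000 and that the `requests` package is installed; the coordinates
# are arbitrary example values.
import requests

resp = requests.get('http://127.0.0.1:5000/weather/data',
                    params={'lat': 52.37, 'lon': 4.89})
print(resp.json())

resp = requests.get('http://127.0.0.1:5000/weather/summarize',
                    params={'lat': 52.37, 'lon': 4.89})
print(resp.json())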
from parser import PlayParser
from retriever import Retriever
import sqlite3

print("Starting...")
retriever = Retriever()


def query(c, sql, args=[]):
    print(sql)
    c.execute(sql, args)


latest_files = retriever.latest_two_filenames()
latest_dates = [retriever.today, retriever.yesterday]

for (filename, date) in zip(latest_files, latest_dates):
    print("Importing {0} for date {1}".format(filename, date.strftime("%Y%m%d")))
    parser = PlayParser(date, filename)
    plays = parser.parse()
    conn = sqlite3.connect('db/top40.db')
    for play in plays:
        c = conn.cursor()
        # find/insert artist
        query(c, 'select artist_id from artists where name=?', [play.artist])
        rows = c.fetchall()
        print(len(rows))
        if len(rows) < 1:
            query(c, 'insert into artists (name) values (?)', [play.artist])
            conn.commit()
fo = open(fileName, "r") except IOError: print "File", fileName, "cannot be read" sys.exit() return fo # Main function from where program starts if __name__ == '__main__': # Open the predefined problem domain related info file here fo = openFile('static_value.txt') exec('content=' + fo.read()) # Create the Retriever module, given case based memory file and # problem domain related info file rt = Retriever('generated_plan.txt', 'state_similarity.txt') # Create a current state and goal state to test our program # Create the current state here currentState = State('current') currentState.target = 'me' currentState.loc = {'me':'IU Campus', 'bus':'bus_stop_2'} currentState.cash = {'me':3050} currentState.balance_bank = {'me':120} currentState.time = {'me':1820} currentState.nearest_bus_stop={'me': 'bus_stop_2'} # Put the domain related info into the state for stateItem in content: exec("currentState."+stateItem)
                          model_args.ff_embed_dim, model_args.num_heads,
                          model_args.dropout, model_args.enc_layers,
                          model_args.dec_layers, model_args.label_smoothing)
    elif model_args.arch == 'mem':
        model = MemGenerator(vocabs, model_args.embed_dim,
                             model_args.ff_embed_dim, model_args.num_heads,
                             model_args.dropout, model_args.mem_dropout,
                             model_args.enc_layers, model_args.dec_layers,
                             model_args.mem_enc_layers,
                             model_args.label_smoothing,
                             model_args.use_mem_score)
    elif model_args.arch == 'rg':
        retriever = Retriever.from_pretrained(
            model_args.num_retriever_heads, vocabs,
            args.index_path if args.index_path else model_args.retriever,
            model_args.nprobe, model_args.topk, args.device,
            use_response_encoder=(model_args.rebuild_every > 0))
        model = RetrieverGenerator(vocabs, retriever, model_args.share_encoder,
                                   model_args.embed_dim, model_args.ff_embed_dim,
                                   model_args.num_heads, model_args.dropout,
                                   model_args.mem_dropout, model_args.enc_layers,
                                   model_args.dec_layers, model_args.mem_enc_layers,
                                   model_args.label_smoothing)

    if args.hot_index is not None:
        model.retriever.drop_index()
        torch.cuda.empty_cache()
        model.retriever.update_index(args.hot_index, model_args.nprobe)
def __init__(self, args, wikidata):
    self.data_path = "data/{}/{}-{}.qa.json".format(
        args.data, args.data, args.data_type)
    self.tagme_path = "data/{}/{}-{}.tagme.json".format(
        args.data, args.data, args.data_type)
    self.wikidata = wikidata
    self.retriever = Retriever(args)

    with open(self.data_path, 'r') as f:
        orig_data = json.load(f)
    with open(self.tagme_path, 'r') as f:
        tagme_data = json.load(f)
    assert len(orig_data) == len(tagme_data)
    print("Loaded {} QA data".format(len(orig_data)))

    self.save_path = "data/{}/{}-{}.retrieved.json".format(
        args.data, args.data, args.data_type)

    #### data to save ###
    n_cross_relations = []
    n_inner_relations = []
    n_total_relations = []
    data_to_save = []

    N_TFIDF = 5 if args.data == "webquestions" else 10
    N_BM25 = 40 if args.data == "webquestions" else 80

    for i, (d, tags) in tqdm(enumerate(zip(orig_data, tagme_data))):
        if len(tags) > 0:
            sorted_tags = sorted(
                tags, key=lambda x: -x['score']) if 'score' in tags[0] else tags.copy()
            tags = []
            for e in sorted_tags:
                # for some reason, tagme keeps tagging "The Who" for "who" questions.
                # we will exclude them.
                if not ((e['entity'] == 'The Who' and e['mention'] == 'who')
                        or e["entity"] == "song"):
                    if e['entity'] not in tags:
                        tags.append(e['entity'])
        tfidf_docs = self.retriever.get_titles_from_query(d['question'], N_TFIDF)
        for t in tfidf_docs:
            if t not in tags:
                tags.append(t)
        keywords = self.wikidata.populate(tags, k=args.n_hops, use_aliases=False)

        collected_docs = set()
        collected_paragraphs = []
        paragraphs_to_run_bm25 = []
        for (doc_name, hop, relation) in keywords[:80]:
            if doc_name in collected_docs:
                continue
            collected_docs.add(doc_name)
            contents = self.retriever.get_contents_from_title(
                doc_name,
                n_words=self.retriever.get_n_words(d['question'], doc_name),
                only_first=hop > 0)
            if len(contents) == 0:
                continue
            collected_paragraphs.append((contents[0], hop, relation))
            assert hop == 0 or len(contents) == 1
            paragraphs_to_run_bm25 += [(content, relation)
                                       for content in contents[1:]]
        collected_paragraphs = [
            par for i, par in sorted(enumerate(collected_paragraphs),
                                     key=lambda x: (x[1][1], x[1][0][1], x[0]))
        ]
        bm25_paragraphs = self.retriever.get_paragraphs_from_documents(
            d['question'], paragraphs_to_run_bm25, N_BM25,
            only_first=False, is_tuple=True)

        pars = [(par, rel) for par, hop, rel in collected_paragraphs if hop == 0]
        pars_1 = [(par, rel) for par, hop, rel in collected_paragraphs if hop == 1]
        for p_i in range(len(bm25_paragraphs)):
            if len(pars_1) > p_i:
                pars.append(pars_1[p_i])
            pars.append(bm25_paragraphs[p_i])
        pars += self.retriever.get_paragraphs_from_documents(
            d['question'], pars_1[len(bm25_paragraphs):], 100,
            only_first=False, is_tuple=True)
        pars += self.retriever.get_paragraphs_from_documents(
            d['question'],
            [(par, rel) for par, hop, rel in collected_paragraphs if hop > 1],
            100, only_first=False, is_tuple=True)

        # truncate pars to be 100 at maximum
        pars = pars[:100]
        relations = [p[1] for p in pars]
        pars = [p[0] for p in pars]

        # get graph information for the GraphReader
        collected_docs = set([par[0] for par in pars])
        graph = self.wikidata.get_graph(collected_docs)
        constructed_graph = {}
        n_cross, n_inner = 0, 0
        for i1, (title1, index1, _) in enumerate(pars):
            for i2, (title2, index2, _) in enumerate(pars):
                if i1 == i2:
                    continue
                if (title1, title2) in graph and index1 == index2 == 0:
                    constructed_graph[(i1, i2)] = graph[(title1, title2)]
                    n_cross += 1
                if title1 == title2 and index1 == 0 and index2 > 0:
                    constructed_graph[(i1, i2)] = ["<CHILD_PARAGRAPH>"]
                    constructed_graph[(i2, i1)] = ["<PARENT_PARAGRAPH>"]
                    n_inner += 2
        n_cross_relations.append(n_cross)
        n_inner_relations.append(n_inner)
        n_total_relations.append(n_cross + n_inner)
        data_to_save.append(
            json.dumps({
                'question': d['question'],
                'answers': d['answers'],
                'paragraphs': pars,
                'graph': {
                    '{} {}'.format(k[0], k[1]): v
                    for k, v in constructed_graph.items()
                }
            }))

    print("Cross", np.mean(n_cross_relations))
    print("Inner", np.mean(n_inner_relations))
    print("Total", np.mean(n_total_relations))

    with open(self.save_path, 'w') as f:
        f.write("\n".join(data_to_save))
class Eris:
    def __init__(self):
        self.output = os.path.join(config.logsDir, "output.log")

    def start(self):
        try:
            with open(config.statusFile, "r") as f:
                if f.read() == config.STATUS_LINE:
                    return
        except:
            pass
        try:
            with open(config.statusFile, "w") as f:
                f.write(config.STATUS_LINE)
        except IOError as e:
            print >>sys.stderr, e
            return
        if self.daemonize():
            return
        self.startTime = datetime.now()
        self.storage = Storage()
        self.btserver = BtServer(self.storage)
        self.retriever = Retriever(self.storage)
        self.btserver.start()
        self.retriever.start()
        daemon = Pyro4.Daemon(port=config.pyroPort)
        uri = daemon.register(self, config.PNAME)
        log.info("Eris daemon URI: [{}]".format(uri))
        self.running = True
        daemon.requestLoop(loopCondition=lambda: self.running)
        daemon.unregister(config.PNAME)
        daemon.close()

    def stop(self):
        log.info("Closing eris")
        try:
            self.btserver.kill()
            self.retriever.kill()
            self.retriever.join(1.0)
            self.btserver.join(1.0)
            with open(config.statusFile, "w"):
                pass
        except:
            log.exception("Something went wrong while shutting down")
        self.running = False

    def status(self):
        pid = os.getpid()
        proc = psutil.Process(pid)
        cpu = proc.get_cpu_percent()
        mem = proc.get_memory_percent()
        uptime = datetime.now() - self.startTime
        du = self.storage.size()
        return (pid, cpu, mem, uptime, du)

    def put(self, packets):
        self.storage.put(packets)

    def get(self, since=0, to=0, limit=0):
        connId, _ = self.storage.get(since, to, limit)
        return self.storage.fetchall(connId)

    def count(self):
        return self.storage.rowcount()

    def ping(self):
        return config.PNAME

    def daemonize(self):
        try:
            pid = os.fork()
            if pid > 0:
                return True
        except OSError as e:
            sys.stderr.write("Fork #1 failed: {} ({})\n".format(e.errno, e.strerror))
            sys.exit(1)
        os.chdir("/")
        os.setsid()
        try:
            pid = os.fork()
            if pid > 0:
                sys.exit(0)
        except OSError as e:
            sys.stderr.write("Fork #2 failed: {} ({})\n".format(e.errno, e.strerror))
            sys.exit(1)
        sys.stdout.flush()
        sys.stderr.flush()
        with open(self.output, "w"):
            pass
        out = file(self.output, "a+", 1)
        os.dup2(out.fileno(), sys.stdout.fileno())
        os.dup2(out.fileno(), sys.stderr.fileno())
        return False

    @staticmethod
    def getProxy():
        uri = "PYRO:{}@localhost:{}".format(config.PNAME, config.pyroPort)
        return Pyro4.Proxy(uri)
class Crawler(object): """crawler goes out to the web and downloads the web pages """ _invalidExt = [ '.pdf', 'jpg', 'jpeg', '.doc', 'docx', '.gif', '.zip', '.rar', '.PDF' ] def __init__(self): self.visited_links = [] self.links_queue = deque([]) self.domain = '' self.same_domain = True self.pageRetriever = Retriever() self.downloader = Downloader() self.linkanalyser = linkanalyser.LinkAnalyzer() logging.basicConfig( filename = 'crawler.log', format = '%(levelname)s:%(message)s', level = logging.INFO ) def crawlPage(self, url, same_domain = True): retrieverResponse = self.downloader.CDownload(url) if retrieverResponse == 0: print retrieverResponse, "Invalid Url.....parsing skipped\n" return self.visited_links.append(url) try: links = self.pageRetriever.getLinks(url) self.linkanalyser.analyze(url, links) except Exception: return for link in links: if link not in self.visited_links: if same_domain == True: if urlparse(link)[1] != self.domain: #print link, " *** discarded for crawl .. not in domain" logging.info("%s * discarded for crawl .. not in domain"%link) else: if link not in self.links_queue: if splitext(link)[1] not in self._invalidExt: self.links_queue.append(link) #print link, " *** new link added to crawl queue" logging.info("%s * new link added to crawl queue"%link) else: #print link,"*** discarded already visited" logging.info("%s * discarded already visited"%link) if same_domain == False: if link not in self.links_queue: self.links_queue.append(link) #print link," *** new link added to crawl queue" logging.info("%s * new link added to crawl queue"%link) else: #print link,"*** discarded already visited" logging.info("%s *** discarded already visited"%link) print "length of queue is ", len(self.links_queue), "len of visited queue is ", \ len(self.visited_links) logging.info("length of queue is %d length of visited queue is %d"\ %(len(self.links_queue), len(self.visited_links))) def start_crawl(self, url, same_domain = True): self.links_queue.append(url) self.domain = urlparse(url)[1] self.same_domain = same_domain # process links in queue while self.links_queue: url = self.links_queue.popleft() self.crawlPage(url)