def trans(src, dst, index, label_index, mode='w', sep='/'):
    # Open (or create) the label index and the feature index.
    lid = indexer.Indexer(label_index, mode)
    inder = indexer.Indexer(index, mode)
    file = open(dst, 'wb')
    ln = 0
    for line in open(src, encoding='utf8'):
        ln += 1
        # Split each token into (word, sep, tag); with sep == ' ' there are no tags.
        wts = [x.rpartition(sep) for x in line.strip().split(' ')]
        if sep == ' ':
            tags = ['' for x in wts]
            line = [x[-1] for x in wts]
        else:
            tags = [x[-1] for x in wts]
            line = [x[0] for x in wts]
        seq = ''.join(line)
        graph = []
        # Per-character feature vectors, keeping only keys present in the index (id >= 0).
        fs = [
            [fid for fid in (inder(k) for k in gen_keys(seq, x)) if fid >= 0]
            for x in range(len(seq))
        ]
        for c, v in zip(_to_tags(tags, line, lid), fs):
            graph.append([0, [], c, v])
        if not graph:
            continue
        # Mark the first node as a start node, the last as an end node,
        # and link every node to its predecessor.
        graph[0][0] += 1
        graph[-1][0] += 2
        for i in range(1, len(graph)):
            graph[i][1] = [i - 1]
        json_to_binary.graph_to_file(graph, file)
        if ln % 1000 == 0:
            print(ln)
    file.close()
    print(len(inder))
    print('the end')
def main():
    indexing = indexer.Indexer('database')
    file_name, query, function = entering()
    indexing.indexing_with_lines(file_name)
    searching = SearchEngine('database')
    result = None
    if function == 1:
        result = searching.multiple_tokens_search(query)
    elif function == 2:
        context_size = int(input('Enter the context size to search for:\n'))
        result = searching.search_to_sentence(query, context_size)
    elif function == 3:
        context_size = int(input('Enter the context size to search for:\n'))
        result = searching.search_to_highlight(query, context_size)
    elif function == 4:
        exit()
    else:
        print('You entered a wrong function number, try again')
        return main()  # re-prompt instead of falling through with result = None
    with open('result.txt', 'w') as result_file:
        for key in result:
            result_file.write(key + ':\n')
            for el in result[key]:
                result_file.write(str(el) + '\n')
            result_file.write('\n')
    del searching
    # Remove the database files created during indexing.
    for filename in os.listdir(os.getcwd()):
        if filename == 'database' or filename.startswith('database.'):
            os.remove(filename)
def main(): indexing = indexer.Indexer("database") ## file = open('text.txt', 'w') ## file.write('На небе. Много. Фиолетовых облачков') ## file.close() ## file2 = open('text2.txt', 'w') ## file2.write('На розоватом. Небе небе много облачков маленьких. J') ## file2.close() ## indexing.index_with_lines('text2.txt') ## file3 = open('text3.txt', 'w') ## file3.write('На голубом преголубом небе небе много облачков много облачков небе. \n птичек много облачков \n звезд') ## file3.close() ## indexing.index_with_lines('text3.txt') ## indexing.closeDatabase() #search = SearchEngine("vim") ## tokenquery = "небе" ## tokenquery2 = "много облачков" ## searchresult = search.highlighted_context_window_search(tokenquery2) #contextsearch1 = search.lim_off_context_window_search_acc('князь Андрей', 3, 0, [[3,0],[3,0],[3,0]]) # print(contextsearch1) ## search.closeDatabase() testfile = open("text.txt", 'w') testfile.write("There are only fluffy kittens!") testfile.close() testfile2 = open("text2.txt", 'w') testfile2.write("only kittens and puppies...") testfile2.close() indexing.index_with_lines("text2.txt") indexing.index_with_lines("text.txt") testsearch = SearchEngine('database') # context '3,3' windowsdict = testsearch.several_tokens_search_with_customizable_context_acc("only kittens", 3, 3, 3, -1) print(windowsdict)
def index():
    Indexer = indexer.Indexer()
    # The original referenced an undefined `_id`; using the position in the
    # data sequence as the document id is an assumption.
    for _id, review in enumerate(loader.LoadDataSeq()):
        toneScore = analyzer.GetToneAnalysis(review["reviews.text"])
        review["toneScore"] = toneScore
        Indexer.Index(_id, review)
def main():
    index = indexer.Indexer('db')
    d = open('tgt.txt', 'w')
    d.write('this is a test required for helping. students create a test\n')
    d.write(' professor required to write a test first')
    d.close()
    index.indexing_with_lines('tgt.txt')
    t = open('ttt.txt', 'w')
    t.write('test is required. On the other hand...')
    t.close()
    index.indexing_with_lines('ttt.txt')
    del index
    engine = SearchEngine('db')
    search = engine.search_multiple('test')
    today = engine.limit_quote_context_search('test', -2, 0, [(2, 0), (1, 0)])
    extended = engine.search_extended_context('test', 2)
    print(today)
    del engine
    # Remove the test files and database files created above.
    if 'tgt.txt' in os.listdir(os.getcwd()):
        os.remove('tgt.txt')
    if 'ttt.txt' in os.listdir(os.getcwd()):
        os.remove('ttt.txt')
    for filename in os.listdir(os.getcwd()):
        if filename == 'db' or filename.startswith('db.'):
            os.remove(filename)
def run(self):
    self.extract_queries()
    idx = Indexer.Indexer()
    idx.run()
    print("Running Queries on index:")
    # Pair each query number with its query text.
    for num, query in zip(self.queryNums, self.queryTexts):
        idx.query(num, query)
def trans(src, dst, index, label_index, mode='w', sep='/', dictionary=None):
    lid = indexer.Indexer(label_index, mode)
    inder = indexer.Indexer(index, mode)
    if dictionary:
        dict_feature = DictFeature(dictionary)
    file = open(dst, 'wb')
    ln = 0
    for line in open(src, encoding='utf8'):
        ln += 1
        wts = [x.rpartition(sep) for x in line.strip().split(' ')]
        if sep == ' ':
            tags = ['' for x in wts]
            line = [x[-1] for x in wts]
        else:
            tags = [x[-1] for x in wts]
            line = [x[0] for x in wts]
        seq = ''.join(line)
        graph = []
        fs = [[inder(k) for k in gen_keys(seq, x)] for x in range(len(seq))]
        # Optionally extend the per-character feature vectors with dictionary features.
        if dictionary:
            dict_feature(seq, inder, fs)
        # Drop feature keys that are not in the index (id < 0).
        fs = [list(filter(lambda x: x >= 0, fv)) for fv in fs]
        for c, v in zip(_to_tags(tags, line, lid), fs):
            graph.append([0, [], c, v])
        if not graph:
            continue
        graph[0][0] += 1
        graph[-1][0] += 2
        for i in range(1, len(graph)):
            graph[i][1] = [i - 1]
        json_to_binary.graph_to_file(graph, file)
        if ln % 1000 == 0:
            print(ln)
    file.close()
    print(len(inder))
    print('the end')
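# Both trans variants above rely on a gen_keys(seq, x) helper that is not
# shown in these snippets. As a hedged illustration only -- the real key
# templates are unknown -- a minimal sketch could emit character n-gram
# feature keys around position x:
def gen_keys(seq, x):
    """Hypothetical sketch: yield unigram and bigram feature keys around
    position x; the actual template set is not given in the source."""
    keys = []
    for offset in (-1, 0, 1):  # a small context window around x
        i = x + offset
        if 0 <= i < len(seq):
            keys.append('u%d:%s' % (offset, seq[i]))  # unigram keys
    if x + 1 < len(seq):
        keys.append('b0:%s%s' % (seq[x], seq[x + 1]))  # bigram key
    return keys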
def setUp(self):
    index = indexer.Indexer('dbase')
    f = open('test.txt', 'w')
    f.write('this is\ntest')
    f.close()
    t = open('tst.txt', 'w')
    t.write('test')
    t.close()
    index.indexing_with_lines('test.txt')
    index.indexing_with_lines('tst.txt')
    del index
    self.s = SearchEngine('dbase')
def setUp(self):
    index = indexer.Indexer('dbase')
    f = open('test.txt', 'w')
    f.write('this is a test required for helping students create a test\n')
    f.write(' professor required to write a test first')
    f.close()
    t = open('tst.txt', 'w')
    t.write('test is required. On the other hand...')
    t.close()
    index.indexing_with_lines('test.txt')
    index.indexing_with_lines('tst.txt')
    del index
    self.s = SearchEngine('dbase')
def build_index():
    corpus_path = util.get_corpus_dir_path_from_args()
    preprocessor = preprocessing.Preprocessor(corpus_path)
    doc_to_terms: list[preprocessing.DocToTerms] = preprocessor.parse()
    indexer_ob = indexer.Indexer(doc_to_terms)
    inverted_index: dict[str, indexer.Posting] = indexer_ob.inverter_index()
    doc_id_name_index: dict[int, str] = indexer_ob.doc_id_to_doc_name_index()
    tf_idf_ranker = ranker.Ranker(inverted_index, doc_id_name_index)
    _tfidf = tf_idf_ranker.tfidf()
    print('Indexing completed..saving...')
    util.save_obj(doc_id_name_index, DOC_ID_NAME_INDEX_NAME)
    util.save_obj(inverted_index, INVERTED_INDEX_FILE_NAME)
    util.save_pandas_df_as_pickle(_tfidf, TFIDF_NAME_INDEX_NAME)
    print('Saved index for quick results for future queries')
def main():
    indexing = indexer.Indexer('database')
    indexing.indexing_with_lines('text.txt')
    searching = SearchEngine('database')
    result = searching.search_to_sentence('туманы')
    print(result)
    del searching
def test(index, src, dst):
    # Open the feature index read-only and stream graphs to the output file.
    inder = indexer.Indexer(index, 'r')
    file = open(dst, 'wb')
    for line in open(src, encoding='utf8'):
        line = line.split()
        seq = ''.join(line)
        graph = []
        fs = [
            list(filter(lambda x: x >= 0, [inder(k) for k in gen_keys(seq, x)]))
            for x in range(len(seq))
        ]
        for c, v in zip(_to_tags(line), fs):
            graph.append([0, [], c, v])
        if not graph:
            continue
        graph[0][0] += 1
        graph[-1][0] += 2
        for i in range(1, len(graph)):
            graph[i][1] = [i - 1]
        json_to_binary.graph_to_file(graph, file)
    print('the end')
    file.close()
def test_scw_single(self):
    """Check that context windows extended to sentence boundaries
    are built correctly when searching for a single word.
    """
    k = open('newtest.txt', 'w')
    k.write('What is your name? My name is test.')
    k.close()
    ind = indexer.Indexer('newdb')
    ind.indexing_with_lines('newtest.txt')
    del ind
    self.k = SearchEngine('newdb')
    result = self.k.search_extended_context('test', 1)
    output = {
        'newtest.txt': [
            Context_Window([Position_with_lines(30, 34, 0)], 19, 35,
                           'What is your name? My name is test.')
        ]
    }
    self.assertEqual(result, output)
    del self.k
    for filename in os.listdir(os.getcwd()):
        if filename == 'newdb' or filename.startswith('newdb.'):
            os.remove(filename)
    os.remove('newtest.txt')
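# Hypothetical sketch of the Position_with_lines value used in the test
# above. The field names are assumptions inferred from the call
# Position_with_lines(30, 34, 0): 'test' starts at offset 30 and ends at
# offset 34 on line 0 of the test string.
from dataclasses import dataclass


@dataclass
class Position_with_lines:
    start: int  # start offset of the match within the line
    end: int    # end offset of the match within the line
    line: int   # zero-based line number in the file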
def __init__(self, ecran, adventure: Adventure, s: socket.socket = None,
             p: tuple = ('127.0.0.1', 5500)):
    self.__start_at__ = 0
    self.adventure = adventure
    self.fps_regulator = ree.create_clock()
    self.continuer = 1
    self.ecran = ecran
    self.sock = s
    self.params = p
    self.renderer_manager = renderer_manager.RendererManager()
    self.show_fps = False

    # Fonts
    self.police_normale = ree.load_font(POLICE_PATH, POL_NORMAL_TAILLE)
    self.police_grande = ree.load_font(POLICE_PATH, POL_GRANDE_TAILLE)
    self.police_petite = ree.load_font(POLICE_PATH, POL_PETITE_TAILLE)

    # Managers
    self.carte_mgr = carte.CartesManager(self.ecran, self.renderer_manager,
                                         self.police_normale)
    self.oth_persos_mgr = personnage.OthPersonnagesManager(self.ecran, self.carte_mgr)
    self.indexeur = indexer.Indexer(self.ecran, self.police_grande,
                                    self.renderer_manager)
    self.equipe_mgr = equipe_manager.EquipeManager(self.ecran, self.police_grande,
                                                   self.indexeur, self.renderer_manager)
    self.pc_mgr = computer_manager.ComputerManager(self.ecran, self.police_grande,
                                                   self.renderer_manager)
    self.tab_types = tab_types.Storage()
    self.cur_combat = None
    self.menu_in_game = menu_in_game.Menu(self.ecran, self.police_grande)
    self.zones_manager = zones_attaques_manager.ZonesManager(self.indexeur)
    self.money = money_mgr.MoneyManager()
    self.gui_save_mgr = GUISauvegarde(self.ecran, self.police_grande)
    self.network_ev_listener = NetworkEventsListener(self.sock, self.params)
    self.chat_mgr = chat_manager.ChatManager(self.ecran, self.police_normale,
                                             self.network_ev_listener,
                                             self.adventure.get_pseudo(), RANG_NUL)
    self.mini_map = carte.CarteRenderer(self.ecran, self.police_normale,
                                        self.adventure)
    self.attaques_table = atk_sys.AttaquesTable()
    self.parametres = ParametresManager()
    self.musics_player = music_player.MusicPlayer()

    # Entities
    self.personnage = personnage.Personnage(self.carte_mgr, self.ecran,
                                            self.police_grande)

    # Controls
    self.controles = {}
    self.controles_joy = {}
    self.joystick = None
    self.__ctrls = {}
    self._default_dt = 1.0
    self._play_music = True
    self._play_anims = True
class Employer:
    id_counter = indexer.Indexer()

    # Initialisation
    def __init__(self):
        self.id = Employer.id_counter.gen()
        self.employees = []
        self.assignment_work_history = []
        self.time = 0
        self.examql = None
        self.renewql = None
        self.changeql = None

    # Employment-type conversion
    def change_employment(self, employee):
        state = self.changeql.state(self, employee)
        return self.changeql.action(state, employee)

    # Hiring examination
    def exam(self, employee):
        state = self.examql.state(self, employee)
        return self.examql.action(state, employee)

    # Contract renewal
    def renew(self, employee):
        state = self.renewql.state(self, employee)
        return self.renewql.action(state, employee)

    # Conversion accepted: promote to regular employment
    def regular_employ(self, employee):
        if employee in self.employees:
            pstate = self.changeql.state(self, employee)
            action = True
            employee.change_regular()
            nstate = self.changeql.state(self, employee)
            reward = self.changeql.reward(self)
            self.changeql.update(pstate, nstate, action, reward)

    # Conversion rejected: keep as temporary employment
    def temporary_employ(self, employee):
        if employee in self.employees:
            pstate = self.examql.state(self, employee)
            action = False
            nstate = self.changeql.state(self, employee)
            reward = self.changeql.reward(self)
            self.changeql.update(pstate, nstate, action, reward)

    # Hiring examination passed
    def employ(self, employee, gdp_index):
        if employee not in self.employees:
            pstate = self.examql.state(self, employee)
            action = True
            employee.employed(self, gdp_index)
            self.employees.append(employee)
            nstate = self.examql.state(self, employee)
            reward = self.examql.reward(self)
            self.examql.update(pstate, nstate, action, reward)

    # Hiring examination failed
    def reject(self, employee):
        if employee not in self.employees:
            pstate = self.examql.state(self, employee)
            action = False
            nstate = self.examql.state(self, employee)
            reward = self.examql.reward(self)
            self.examql.update(pstate, nstate, action, reward)

    # Contract renewed
    def keep(self, employee):
        if employee in self.employees:
            pstate = self.renewql.state(self, employee)
            action = True
            nstate = self.renewql.state(self, employee)
            reward = self.renewql.reward(self)
            self.renewql.update(pstate, nstate, action, reward)

    # Contract not renewed: dismissal
    def fire(self, employee):
        if employee in self.employees:
            pstate = self.renewql.state(self, employee)
            action = False
            employee.fired(self)
            self.employees.remove(employee)
            nstate = self.renewql.state(self, employee)
            reward = self.renewql.reward(self)
            self.renewql.update(pstate, nstate, action, reward)

    # A worker agent resigns
    def resign(self, employee):
        if employee in self.employees:
            self.employees.remove(employee)

    # Count employees by employment type
    def count_worker_type(self, work_type):
        return len([e for e in self.employees if e.work_type == work_type])

    # Count regular workers
    def count_regular(self):
        return self.count_worker_type(WORKER_TYPE.REGULAR)

    # Count temporary workers
    def count_temporary(self):
        return self.count_worker_type(WORKER_TYPE.TEMPORARY)

    # Count worker agents
    def count_employee(self):
        return len(self.employees)

    # Assign work derived from GDP
    def set_work(self, work):
        self.assignment_work_history.append(work)

    def assigned_work(self, index=-1):
        if index < 0:
            if abs(index) > len(self.assignment_work_history):
                return 0
        else:
            if index + 1 > len(self.assignment_work_history):
                return 0
        return self.assignment_work_history[index]

    # Amount of work done
    def processed_work(self):
        return sum([e.work() for e in self.employees])

    # Amount of work remaining
    def remained_work(self):
        return self.assigned_work() - self.processed_work()

    # Advance time
    def elapse(self, time_interval=1):
        self.time += time_interval

    # Reset
    def clear(self):
        self.employees.clear()
        self.assignment_work_history.clear()
        self.time = 0

    def to_s(self):
        employee_str = ', \n'.join([e.to_s() for e in self.employees])
        return '<EMPLOYER ' + \
            'ID:' + str(self.id) + ', ' + \
            'EMPLOYEES = \n' + employee_str + \
            'T_TASK = ' + str(self.assigned_work()) + \
            'P_TASK = ' + str(self.processed_work()) + \
            '\n>'
class Employee:
    id_counter = indexer.Indexer()

    # Initialisation
    def __init__(self):
        self.id = Employee.id_counter.gen()
        self.age = WORKER_AGE.LOWER
        self.work_type = WORKER_TYPE.JOBLESS
        self.employer = None
        self.length_of_service = 0
        self.rewards = {}
        self.selection_strategy = None

    # Start-of-year processing
    def begin(self):
        self.elapse()

    # Decide which employer agents to apply to
    def select_employers(self, employers, num=1):
        return self.selection_strategy.select(self, employers, num)

    # Get hired (every worker agent starts as a temporary worker)
    def employed(self, employer, gdp_index):
        if self.employer is not None and self.employer != employer:
            self.retire()
        self.employer = employer
        self.work_type = WORKER_TYPE.TEMPORARY
        self.length_of_service = 0
        self.rewards[self.employer] = 0

    # Work (labour-force calculation)
    def work(self):
        # Labour force peaks at age 43 (MHLW data); a worker at retirement
        # age has 1.5x the labour force of a new graduate, and GDP per
        # person is 67. Age efficiency is the quadratic fit
        #   f(x) = -0.00236*x^2 + 0.22*x - 3.12325  (x: working age)
        # with f(x) = 1 at the minimum working age; service-length
        # efficiency g is linear (1 at age 20, 2 at age 55).
        alpha = 0.5
        beta = 0.5
        gdp_unit = 67
        f = lambda x: -0.00236 * x**2 + 0.22 * x - 3.12325  # age efficiency
        g = lambda x: 0.029464 * x + 0.410714                # service-length efficiency
        dp = alpha * f(self.age) + beta * g(self.length_of_service)
        return gdp_unit * dp

    # Receive salary
    def salary(self, money):
        if self.employer is not None:
            if self.work_type == WORKER_TYPE.REGULAR:
                self.rewards[self.employer] += money * 2
            elif self.work_type == WORKER_TYPE.TEMPORARY:
                self.rewards[self.employer] += money

    # Change employment type
    def change(self, work_type):
        self.work_type = work_type

    # Convert to regular employment
    def change_regular(self):
        self.work_type = WORKER_TYPE.REGULAR

    # Retire
    def retire(self):
        # Notify the employer agent, if any.
        if self.employer is not None:
            self.employer.resign(self)
        self.employer = None
        self.work_type = WORKER_TYPE.JOBLESS
        self.length_of_service = 0

    # Receive a dismissal notice
    def fired(self, employer):
        self.employer = None
        self.work_type = WORKER_TYPE.JOBLESS
        self.length_of_service = 0

    # Advance time (length of service and age)
    def elapse(self, time_interval=1):
        self.length_of_service += time_interval
        self.age += time_interval

    # Check working age
    def is_worker_age(self):
        return WORKER_AGE.LOWER <= self.age <= WORKER_AGE.UPPER

    # String representation
    def to_s(self):
        return "<EMPLOYEE \n" + \
            "ID:" + str(self.id) + ", " + \
            "EMPLOYER = " + (str(self.employer.id) if self.employer is not None else "None") + ", " + \
            "EMP_TYPE = " + str(self.work_type) + \
            "\n>"

    # End-of-year processing
    def end(self):
        # If the law comes into force: self.change(WORKER_TYPE.REGULAR)
        # for workers with over 5 years of service.
        pass
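# Both Employer and Employee above use indexer.Indexer() as a plain ID
# generator via .gen(). That Indexer is not shown here; a minimal sketch
# of a counter with the interface assumed from usage:
class Indexer:
    """Hypothetical sketch: monotonically increasing ID generator."""

    def __init__(self):
        self._next_id = 0

    def gen(self):
        # Return the next unused ID and advance the counter.
        current = self._next_id
        self._next_id += 1
        return current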
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import flask

import config
import indexer
import main
import search
import settings
import static

if __name__ == "__main__":
    indexer = indexer.Indexer(config.INDEX_DIR, config.DOC_DIRS)
    app = flask.Flask(__name__)
    app.secret_key = config.SECRET_KEY
    app.add_url_rule("/", view_func=main.Main.as_view("main"))
    app.add_url_rule("/search",
                     view_func=search.SearchResult.as_view("search", indexer))
    app.add_url_rule("/settings",
                     view_func=settings.SearchSettings.as_view("settings"))
    # Expose each configured document directory through a document view.
    for directory in config.DOC_DIRS:
        app.add_url_rule("/{0}/<path:path>".format(directory),
                         view_func=static.DocumentView.as_view(directory, directory))
    app.run("localhost", 8080, False)
def setUp(self):
    """Create an object of the Indexer class."""
    self.i = indexer.Indexer('dbase')
    sys.exit(1)  # exit the interpreter

print('Desired precision@10 for context: {}'.format(arglist[1]))
print('Desired precision@10 for trends: {}'.format(arglist[2]))
precisionTenTargBing = float(arglist[1])      # must convert string to float
precisionTenTargTwitter = float(arglist[2])   # must convert string to float

# Connect to the Bing and Twitter clients and build the indexer; the query
# terms come from the remaining command-line arguments.
bingClient = bingclient.BingClient(constants.BING_ACCT_KEY)
twitterClient = twitterclient.TwitterClient(constants.APP_KEY, constants.APP_SECRET,
                                            constants.OAUTH_TOKEN,
                                            constants.OAUTH_TOKEN_SECRET)
indexer = indexer.Indexer()
expandedQueryBing = ' '.join(arglist[3:])
queryOptimizer = rocchio.RocchioOptimizeQuery(expandedQueryBing)
firstPass = 1
precisionAtK = 0.00
queryWeights = {}

# While precision@10 is below the target: issue a query, measure the new
# precision, expand the query, and repeat.
while precisionAtK < precisionTenTargBing:
    precisionAtK = 0.00  # reset precision each round
    # Process a query.
    print('Parameters')
    print('%-20s= %s' % ("Query", expandedQueryBing))
    print('%-20s= %s' % ("Target Precision", precisionTenTargBing))
import flask
from flask import request, jsonify
from flask_cors import CORS

import controller
import lookup
import indexer
import apptrace
import settings

app = flask.Flask(__name__)
cors = CORS(app, resources={r"/v1/*": {"origins": "*"}})

setting = settings.Setting()
debug = setting.debugMode
trx = apptrace.AppTrace(setting.debugMode)
indx = indexer.Indexer(trx, setting)
lp = lookup.Lookup(trx, setting)
ctrl = controller.Controller(lp, indx, trx, setting)


@app.errorhandler(404)
def page_not_found(e):
    return "<h1>404</h1><p>The resource could not be found.</p>", 404


@app.route('/v1/search', methods=['GET'])
def search():
    query_parameters = request.args
    enableTrace = 'enabletrace' in query_parameters
def setUp(self):
    self.testindexer = indexer.Indexer('database')
head = "".join(doc[1]) text = "".join(doc[2]) headline_toks = ts.stem_tokens(ts.removestops(ts.tokenizeText(head))) text_toks = ts.stem_tokens(ts.removestops(ts.tokenizeText(text))) tdoc = [doc_id, headline_toks, text_toks] term_docs.append(tdoc) print("tokenization, stopping, stemming DONE.") timestep1 = timer() print("after " + str(timestep1 - starttime) + " seconds.") ##create token file #with open("tokens.txt", 'w') as f: # for doc in term_docs: # f.write(str(doc[0])+"\n") # f.write(str(doc[1]+doc[2])+"\n") #create index for docs -> indexer.py index = indexer.Indexer(term_docs) #SAVE INDEX VARIABLE for the query search with open("indexvar.txt", 'wb') as f: pickle.dump(index, f, protocol=-1) #SAVE PRINT VERSION OF INDEX for viewing indexer.PrintIndex2Text(index, "index.txt") print("Positional inverted index created, find print version: 'index.txt'") timestep2 = timer() print("after " + str(timestep2 - timestep1) + " seconds.") ########################################## """ Extra variable: A simple document index, so when document IDs matching a query are found, we can also return the actual
def __init__(self, index):
    self.fid = indexer.Indexer(index)
import os

import indexer

if __name__ == '__main__':
    indexer = indexer.Indexer(index_path='./inverted_index/')
    path = '/home/tani/wikidump/'
    for filename in os.listdir(path):
        # Parse only extracted dump files, skipping the compressed ones.
        if filename.startswith('wikidump') and not filename.endswith('bz2'):
            print('parsing {}...'.format(filename))
            indexer.parse_data(os.path.join(path, filename))
    indexer.finish_indexing()
def main():
    # Load a previously pickled index if one exists.
    index_dict = {}
    try:
        pkl_file = open(indexer.Indexer.filename, 'rb')
        index_dict = pickle.load(pkl_file)
        pkl_file.close()
    except IOError:
        print("Pickle file not found.")
    indx = indexer.Indexer(index_dict)
    db_manager = dbmanager.dbmanager(DB_NAME)
    logging.basicConfig(filename=LOG_NAME,
                        format='%(asctime)s:%(levelname)s:%(message)s',
                        filemode='w', level=logging.WARN)
    frontier = ['http://www.theonion.com',
                'http://www.reddit.com',
                'https://en.wikipedia.org/wiki/Satire']
    visited = {}
    domains = {}
    db_visited = db_manager.get_visited()
    db_frontier = db_manager.get_frontier()
    frontier += db_frontier
    for url in db_visited:
        print("Already visited: " + url)
        visited[url] = 1
    current_threads = 0
    threads = []
    data = []
    t_urls = []
    for url in frontier:
        # Skip URLs that were already visited, exceed the per-domain
        # request budget, or are blacklisted.
        if visited.get(url, None):
            logging.info("Not requesting " + url + " because it has already been visited.")
            continue
        if domains.get(get_domain(url), 0) >= MAX_REQ_PER_DOMAIN:
            logging.info("Not requesting " + url + " because max requests per domain has been exceeded.")
            continue
        if is_blacklisted(url):
            logging.info("Not requesting " + url + " because it is blacklisted.")
            continue
        if current_threads < MAX_THREADS:
            logging.info("Requesting " + url)
            print("Requesting " + url + " as t=" + str(current_threads))
            visited[url] = 1
            urldom = get_domain(url)
            if urldom in domains:
                domains[urldom] += 1
            else:
                domains[urldom] = 1
            d = []
            data.append(d)
            t_urls.append(url)
            t = Requester(url, TIME_LIMIT, d, MAX_SIZE_BYTES)
            t.start()
            threads.append(t)
            current_threads += 1
        if current_threads >= MAX_THREADS or url == frontier[-1]:
            # Wait for the current batch of request threads, then index
            # the fetched pages and grow the frontier.
            current_threads = 0
            for t in threads:
                t.join()
            for i in range(len(t_urls)):
                htmldata = ""
                if data[i]:
                    htmldata = data[i][0]
                db_manager.insert_visited(t_urls[i], len(htmldata))
                page_urls = list(set(get_urls(t_urls[i], htmldata)))
                indx.index_page(t_urls[i], htmldata)
                db_manager.insert_frontier(page_urls, t_urls[i])
                frontier += page_urls
            # Checkpoint the index to disk after each batch.
            output_pkl = open(indexer.Indexer.filename, 'wb')
            pickle.dump(indx.index, output_pkl)
            output_pkl.close()
            threads = []
            data = []
            t_urls = []
    db_manager.close()
import time
import psutil
import indexer as Indexer
import document
import sys
import gc
import os
import merge
import random
import calculations

# Document id
docid = 0

# Indexer of tokenizer
indexer = Indexer.Indexer()

# Begin the timer.
start = time.time()


def memory_usage_psutil():
    # Return this process's memory usage as a percentage (like top).
    process = psutil.Process(os.getpid())
    mem = process.memory_percent()
    return mem


# Load all files from the input folder.
def getFiles(path):
    files = os.listdir(path)
#path = "/Users/shireenhsu/Desktop/121_Assignment3/DEV" #path = "/Users/jason/Desktop/ANALYST" #Actually reading the JSON and merging the files into one output.txt path = input("Enter Path Name: ") files = readFiles(path) doc_id = DocID() manager = IndexerManager(doc_id, files) get_doc_lock = threading.Lock() #locks for multithreading simhash_lock = threading.Lock() indexers = [ indexer.Indexer( "partial(thread" + str(i) + ").txt", manager, #creates and instntiates indexers based on THREADS constant get_doc_lock, simhash_lock, i) for i in range(1, THREADS + 1) ] for indexer in indexers: indexer.start() #starts all indexer threads for indexer in indexers: indexer.join() #waits for all indexer threads mergeFiles( manager.partial_indexes ) #merges the partial indexes written by indexers to the manager doc_id.write_doc_id( "docID.json") #stores the docID dictionary for use later indexIndex("output.txt", "indexindex.json" ) #creates an index of the index for optimized search times