def add_to_index(self, corpus_dir, corpus_type, stop_file=None, tag_file=None, synch_freq=10000):
    """Add the corpus under ``corpus_dir`` to this index, if not already present.

    The corpus is identified by the basename of its directory; corpora
    already recorded in ``self.index_contents`` are skipped entirely.
    """
    # Normalize first so a trailing slash does not produce an empty basename.
    corpus_name = os.path.basename(os.path.normpath(corpus_dir))
    if corpus_name in self.index_contents:
        return  # already indexed — nothing to do

    index_path = os.path.join(self.directory, 'index.txt')
    count_path = os.path.join(self.directory, 'total_word_count.txt')
    index.build_index(corpus_dir, corpus_type, stop_file, index_path,
                      tag_file, count_path, synch_freq)

    # Remember how this corpus was indexed and persist the registry.
    self.index_contents[corpus_name] = (corpus_type, stop_file, tag_file)
    self.save_instance()
def test_index_database(self):
    """End-to-end check: build the index, then verify the DB table rows."""
    idx = index.index('test_data/config_file.yaml')
    index.build_index(idx, 'test_data/config.ini')

    # Connect with the same credentials the index builder used.
    creds = index.load_config_file('test_data/config.ini')
    connection = psycopg2.connect(
        database=creds.get('database'),
        user=creds.get('user'),
        password=creds.get('password'),
    )
    cursor = connection.cursor()
    cursor.execute("SELECT * FROM index order by 1,2;")
    rows = cursor.fetchall()

    expected = [
        ('as', 'test_data/full_file_dirty.txt', [1, 2]),
        ('bulgaria', 'test_data/full_file_dirty.txt', [1, 3]),
        ('bulgaria', 'try/try.txt', [22, 84, 99]),
        ('bush', 'test_data/full_file_dirty.txt', [1]),
        ('the', 'test_data/full_file_dirty.txt', [1]),
    ]
    self.assertEqual(expected, rows)
def main() -> None:
    """Driver: read a data file and product list from argv, print the cheapest shop."""
    filename = sys.argv[1]
    products = sys.argv[2:]

    # Build the item/shop lookup structures from the data file.
    items = data.read_data(filename)
    item_index, shop_index = index.build_index(items)

    # Only shops that stock the requested products are candidates.
    candidate_shops = utils.find_shops(item_index, products)

    # One (shop_id, total_price) pair per candidate shop.
    totals = [
        (shop, utils.calculate_price(shop_index, products, shop))
        for shop in candidate_shops
    ]

    # None when no shop carries the products.
    best_price = min(totals, key=lambda pair: pair[1]) if totals else None
    print(best_price)
def add_to_index(self, corpus_dir, corpus_type, stop_file=None, tag_file=None, synch_freq=10000):
    """Index the corpus at ``corpus_dir`` unless it was indexed before.

    The directory basename serves as the registry key in
    ``self.index_contents``; known corpora are silently skipped.
    """
    key = os.path.basename(os.path.normpath(corpus_dir))
    if key not in self.index_contents:
        target = os.path.join(self.directory, 'index.txt')
        counts = os.path.join(self.directory, 'total_word_count.txt')
        index.build_index(corpus_dir, corpus_type, stop_file, target,
                          tag_file, counts, synch_freq)
        # Record the indexing configuration and persist the registry.
        self.index_contents[key] = (corpus_type, stop_file, tag_file)
        self.save_instance()
def main():
    """Build the index from embedding files, then upload the resulting artefacts."""
    args = get_args()

    # Phase 1: build the index locally, timing the whole step.
    started = datetime.utcnow()
    logging.info('Index building started...')
    index.build_index(args.embedding_files, LOCAL_INDEX_FILE, args.num_trees)
    finished = datetime.utcnow()
    logging.info('Index building finished.')
    logging.info('Index building elapsed time: {} seconds'.format(
        (finished - started).total_seconds()))

    # Phase 2: upload the artefacts, timed the same way.
    started = datetime.utcnow()
    logging.info('Uploading index artefacts started...')
    upload_artefacts(args.index_file)
    finished = datetime.utcnow()
    logging.info('Uploading index artefacts finished.')
    logging.info('Uploading index artefacts elapsed time: {} seconds'.format(
        (finished - started).total_seconds()))
def create_index():
    """Trigger a full index build and return a confirmation string.

    Returns:
        The literal string ``'create index'`` — presumably consumed as a
        plain-text response by a web framework route; confirm with caller.
    """
    build_index()
    return 'create index'
# Command-line flags. Help texts are Chinese, encoded to GBK for the
# Windows console (Python 2 style: u'...'.encode('GBK')).
# -b: build a new index for the named channel(s)
parser.add_argument('-b', nargs='*', default=[], help=u'build,建立新的索引;格式:“频道名”'.encode('GBK'))
# -u: update the index; with no value, 'all' channels are updated
parser.add_argument('-u', nargs='?', const='all', default=False, help=u'update,更新索引;无参数则更新全部,格式:“频道名”'.encode('GBK'))
# -n: flag meaning "do not download images"; note the inverted defaults
# (default=True, const=False when the flag is given without a value)
parser.add_argument('-n', nargs='?', const=False, default=True, help=u'not,若不下载图片,输入此提示符,无其它参数'.encode('GBK'))
# -c: channel name(s) to download
parser.add_argument('-c', nargs='*', default=[], help=u'channel,频道名;格式:“频道名(1) 频道名(2)”'.encode('GBK'))
# -t: time range, defaulting to whatever change_time() computes
parser.add_argument('-t', nargs='*', default=change_time(), help=u'time,时间范围;格式:“年.月.日(1) 年.月.日(2)”'.encode('GBK'))
# -p: per-channel save paths as alternating "channel path" pairs
parser.add_argument('-p', nargs='*', default={}, help=u'path,保存路径;格式:“频道(1) 路径(1) 频道(2) 路径(2)”'.encode('GBK'))
args = parser.parse_args()
# Dispatch on whichever flags were supplied; -a/-s/-d are defined
# elsewhere in this file (not visible in this chunk).
if args.a: info.add(args.a, args.ad)
if args.s: info.show(args.s)
if args.d: info.delete(args.d)
# Only the first -b channel is indexed here — TODO confirm whether the
# remaining entries of args.b are intentionally ignored.
if args.b: index.build_index(args.b[0])
if args.u: index.update_what(args.u)
# NOTE(review): 'downlaod' looks like a typo for 'download', but it may be
# the actual module name elsewhere in this project — verify before renaming.
if args.n: downlaod.download(args.c, args.t, args.p)
## Adicionando ambiente Django print "Adicionando ambiente Django..." sys.path.insert(0, '/Users/phillipe/Projects/storyline') os.environ['DJANGO_SETTINGS_MODULE'] = 'project.settings' from apps.search.models import Article except ImportError: print "Ocorreu um erro na importação dos módulos necessários\ para a construção do índice e inclusão dos documentos." raise # Criando índice try: print "O índice está sendo construído, ou obtido (se já existir)..." idx = index.build_index() except: print "Não foi possível obter o índice." raise try: print "Lendo os documentos a serem indexados..." docs = Article.objects.all() except Article.DoesNotExist: print "Não foi possível carregar a lista de documentos." raise try: print "Adicionando os documentos ao índice..." add_docs(docs, idx) print "Total de documentos indexados:", idx.doc_count()
import index

if __name__ == "__main__":
    print()
    print("Welcome to Hannah and Rey's Search Engine!")
    print()
    # Corpus root comes from the command line; json_to_dict (defined
    # elsewhere in this file) maps document IDs to file paths — presumably.
    corpus = sys.argv[1]
    file_directory = json_to_dict(corpus)
    # print(file_directory["0/1"])
    index_file = 'index.txt'
    # NOTE(review): handle is opened here but never closed in the visible
    # code — confirm write_index_to_file closes it.
    i = open(index_file, 'w')
    index.build_index(file_directory, corpus)
    # i.write("hello world")
    index.write_index_to_file(i)
    # print(index.dictionary)
    # Module-level state populated by build_index.
    print("# VISITED DOCUMENTS: " + str(index.visitedDocuments))
    print("UNIQUE WORDS: " + str(len(index.dictionary)))
    # Output files for the three canned queries; handles stay open here
    # (the code that uses them continues past this chunk).
    inf_q = 'informatics_q.txt'
    mondego_q = 'mondego_q.txt'
    irvine_q = 'irvine_q.txt'
    inf = open(inf_q, 'w')
    mon = open(mondego_q, 'w')
    irv = open(irvine_q, 'w')
    inf_query = enter_query()
import os, sys, index ## Adicionando ambiente Django print "Adicionando ambiente Django..." sys.path.insert(0, '/Users/phillipe/Projects/storyline') os.environ['DJANGO_SETTINGS_MODULE'] = 'project.settings' from apps.search.models import Article except ImportError: print "Ocorreu um erro na importação dos módulos necessários\ para a construção do índice e inclusão dos documentos." raise # Criando índice try: print "O índice está sendo construído, ou obtido (se já existir)..." idx = index.build_index() except: print "Não foi possível obter o índice." raise try: print "Lendo os documentos a serem indexados..." docs = Article.objects.all() except Article.DoesNotExist: print "Não foi possível carregar a lista de documentos." raise try: print "Adicionando os documentos ao índice..." add_docs(docs, idx)
app = Flask( __name__, static_url_path='', static_folder='frontend/dist' ) app.config['ENV'] = 'development' INDEX_URL = "http://*****:*****@app.route('/search', methods=['POST']) def search(): content = request.get_json(silent = True) query = content['query'] start = time.time() docs = process_query(query, index, 100) result = {} result['documents'] = [] for score, docId in docs: line = linecache.getline(RAW_TSV_FILE, docId + 1).strip().split('\t') curr_doc = {}
# Resolve the directory containing this add-on. __file__ is undefined when
# the script is executed by some embedded interpreters, which raises
# NameError; fall back to the current working directory in that case.
try:
    cur_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:  # was a bare `except:` — that also swallowed SystemExit/KeyboardInterrupt
    cur_dir = os.getcwd()

# Make the bundled libraries importable before importing the local modules.
sys.path.append(os.path.join(cur_dir, 'resources', 'lib'))
import index, play

# The plugin parameters arrive as a query string in argv[2] (leading '?'
# stripped); parse_qs maps each key to a list of values.
args = urlparse.parse_qs(sys.argv[2][1:])
play_arg = args.get('play', None)
sid_arg = args.get('sid', None)
eid_arg = args.get('eid', None)
category_arg = args.get('category', None)
vtid_arg = args.get('vtid', None)
nid_arg = args.get('nid', None)

# Route to the right view, most specific parameter combination first.
if play_arg is not None:
    # NOTE(review): assumes 'nid' always accompanies 'play' — nid_arg[0]
    # would raise TypeError if it is missing; confirm against callers.
    play.play(url=play_arg[0], nid=nid_arg[0])
elif sid_arg is not None and category_arg is not None and vtid_arg is not None:
    index.build_index(sid=sid_arg[0], category=category_arg[0], vtid=vtid_arg[0])
elif sid_arg is not None and category_arg is not None:
    index.build_index(sid=sid_arg[0], category=category_arg[0])
elif sid_arg is not None and eid_arg is not None:
    index.build_index(sid=sid_arg[0], eid=eid_arg[0])
elif sid_arg is not None:
    index.build_index(sid=sid_arg[0])
else:
    # No parameters: render the top-level index.
    index.build_index()