def main(flist, dbname='ftp_files.db', xname='xapian.db', verbose=False):
    '''
    Main method: dispatches tasks to catalogue and index remote FTP servers.
    '''
    db = Database(dbname)
    indexer = Indexer(xname, writeable=True)

    # Read list of remote FTP servers
    servers = []
    with open(flist) as f:
        servers = f.read().splitlines()

    for server in servers:
        if verbose:
            print "Scanning: %s" % server

        # Record all files on a remote server
        if not enumerate_files(server, db):
            print "Could not enumerate files on %s" % server

        # Download text and add to corpus
        if not index_content(server, indexer, db):
            print "Could not index %s" % server

    if verbose:
        print "\nCataloguing and indexing complete."

    # cleanup
    indexer.close()
    db.close()
def __main__(argv):
    logger = logging.getLogger(__name__)
    logger.info("VECTOR MODEL INFORMATION RETRIEVAL SYSTEM START")

    gli = InvertedIndexGenerator(GLI_CONFIG_FILE)
    gli.run()
    gli.write_output()

    index = Indexer(INDEX_CONFIG_FILE, TfidfVectorizer)
    index.run()
    index.write_output()

    pc = QueryProcessor(PC_CONFIG_FILE)
    pc.run()
    pc.write_output()

    buscador = SearchEngine(BUSCA_CONFIG_FILE, TfidfVectorizer)
    buscador.run()
    buscador.write_output()

    avaliador = Evaluator(AVAL_CONFIG_FILE)
    avaliador.run()
    avaliador.write_output()

    logger.info("VECTOR MODEL INFORMATION RETRIEVAL SYSTEM DONE")
def __init__(self, grid_desc):
    assert isinstance(grid_desc, (list, tuple))
    for gd in grid_desc:
        assert isinstance(gd, (list, tuple))
        assert 3 == len(gd)

    self.dim = len(grid_desc)
    self.grid_desc = grid_desc  # List of (low,high,num) triples

    (low, hi, num_cells) = zip(*self.grid_desc)
    self.lower_bound = np.array(low, dtype=np.double)
    self.upper_bound = np.array(hi, dtype=np.double)
    self.num_cells = np.array(num_cells, dtype=np.integer)
    assert not np.any(self.num_cells <= 0)
    self.num_nodes = self.num_cells + 1

    # Cell dimensions
    self.delta = (self.upper_bound - self.lower_bound)
    self.delta /= self.num_cells.astype(np.double)

    # Initialize the indexer
    self.cell_indexer = Indexer(self.num_cells)
    self.node_indexer = Indexer(self.num_nodes)

    # Fuzz to convert [low,high) to [low,high]
    self.fuzz = 1e-15
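# Usage sketch for the grid constructor above. The enclosing class name is
# not shown in the snippet; `RegularGrid` is a stand-in, and `Indexer` is
# assumed to accept an array of per-dimension sizes:
#
#     grid = RegularGrid([(0.0, 1.0, 10), (-1.0, 1.0, 20)])
#     assert grid.dim == 2
#     assert np.allclose(grid.delta, [0.1, 0.1])  # (high - low) / num_cells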
def flush(self):
    if not self._mode:
        raise TarCMS.TarCMSError('not open: %r' % self)
    self._corpus.flush()
    self._artdb.flush()
    indexer = Indexer(self._indexdb, self._corpus, verbose=self.verbose)
    for tid in self._loctoindex:
        indexer.index_loc(tid)
    indexer.finish()
    self._loctoindex.clear()
    return
def recover(self):
    if self._mode:
        raise TarCMS.TarCMSError('already open: %r' % self)
    self._corpus.recover_catalog()
    self.recover_artdb()
    self._indexdb.reset()
    indexer = Indexer(self._indexdb, self._corpus, verbose=self.verbose)
    for tid in self._corpus.get_all_locs():
        indexer.index_loc(tid)
    indexer.finish()
    return
def home_page(request):
    context = {}
    search_query = request.GET.get("input")
    if search_query is not None:
        indexer = Indexer()
        result = indexer.search_query_result(search_query)
        print(result)
        context['urls'] = result
    return render(request, 'home.html', context)
def main():
    if len(sys.argv) < 2:
        print "Usage: python test_indexer.py <source path> [destination]"
        sys.exit(1)
    source = sys.argv[1]
    destination = None
    if len(sys.argv) > 2:
        destination = sys.argv[2]

    # initialize an indexer object
    indexer = Indexer(source, destination)
    # preprocess the yelp dataset
    indexer.preprocess()
    # index the preprocessed data
    indexer.index()
def main(search_terms):
    dbname = 'ftp_files.db'
    db = Database(dbname)
    xname = 'xapian.db'
    corpus = Indexer(xname)

    result = corpus.search(str(search_terms))
    print_results(result[0], result[1], result[2], db)

    # clean up
    corpus.close()
    db.close()
def parse_html(url, bs):
    print 'Start parse html from url: ' + str(url)
    body = bs.find('body')
    if body is None:
        return
    raw_text = body.get_text()
    words = get_words_from_raw_text(raw_text)
    dict_words = get_dict_words(words[:100])
    # print dict_words
    print 'Start Indexing url: ' + str(url)
    indexer = Indexer(url=url, words=dict_words)
    indexer.save()
def main(flist, plist="prefix.conf", dbname="ftp_files.db",
         xname="xapian.db", verbose=False):
    """
    Main method: dispatches tasks to catalogue and index remote FTP servers.
    """
    db = Database(dbname)
    indexer = Indexer(xname, writeable=True)

    # Read list of prefixes
    prefixes = []
    with open(plist) as f:
        prefixes = f.read().splitlines()

    # Read list of remote FTP servers
    servers = []
    with open(flist) as f:
        servers = f.read().splitlines()

    # Compile list of all servers: each prefixed variant, then the bare name
    all_servers = []
    for server in servers:
        for prefix in prefixes:
            all_servers.append(prefix + "." + server)
        all_servers.append(server)
    servers = all_servers

    for server in servers:
        if verbose:
            print "Scanning: %s" % server

        # Determine if server is a valid FTP site
        if not is_open_ftp_server(server):
            continue
        if verbose:
            print "\tServer is valid, connecting..."

        # Record all files on a remote server
        if not enumerate_files(server, db, verbose=verbose):
            print "\tCould not enumerate files on %s" % server
            continue

        # Download text and add to corpus
        if not index_content(server, indexer, db, verbose=verbose):
            print "\tCould not index %s" % server

    if verbose:
        print "\nCataloguing and indexing complete."

    # cleanup
    indexer.close()
    db.close()
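# is_open_ftp_server is referenced above but not defined in this snippet.
# A minimal sketch of what it might look like, assuming anonymous FTP login
# and a short timeout (the function name comes from the caller; the body
# here is an illustration, not the original implementation):

from ftplib import FTP, error_perm
import socket

def is_open_ftp_server(server, timeout=10):
    """Return True if `server` accepts an anonymous FTP login."""
    try:
        ftp = FTP(server, timeout=timeout)
        ftp.login()  # anonymous login
        ftp.quit()
        return True
    except (error_perm, socket.error, OSError):
        return False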
def index_esri_server(server_id):
    app.logger.info('Indexing ESRI server %s', server_id)
    server = EsriServer.query.get(server_id)
    if not server:
        app.logger.error('ESRI server %s was not found', server_id)
        return

    server.status = 'importing'
    db.session.add(server)
    db.session.commit()

    resulting_status = 'errored'
    try:
        indexer = Indexer(app.logger)
        services = indexer.get_services(server.url)
        for service in services:
            service_details = indexer.get_service_details(service.get('url'))
            db_service = Service(
                server=server,
                name=service.get('name'),
                service_type=service.get('type'),
                service_data=service_details,
            )
            db.session.add(db_service)

            layers = service_details.get('layers', [])
            for layer in layers:
                db_layer = Layer(
                    service=db_service,
                    name=layer.get('name'),
                    layer_data=layer,
                )
                db.session.add(db_layer)
        resulting_status = 'imported'
    except (requests.exceptions.RequestException, ValueError):
        app.logger.exception('Problem indexing ESRI server %s', server_id)

    server.status = resulting_status
    server.job_id = None
    db.session.add(server)
    db.session.commit()
def main():
    global indexer, uploader, sender, receiver, downloader
    setup_signals()
    logging.info("Asink client started at %s" %
                 (time.strftime("%a, %d %b %Y %X GMT", time.gmtime())))

    # create all threads which will be used to process events
    indexer = Indexer()
    uploader = Uploader()
    sender = Sender()
    receiver = Receiver()
    downloader = Downloader()

    # create and set up queues which are used to pass events between threads
    uploader_queue = Queue()
    indexer.uploader_queue = uploader_queue
    uploader.queue = uploader_queue  # set on watcher when initialized

    sender_queue = Queue()
    uploader.sender_queue = sender_queue
    sender.queue = sender_queue

    downloader_queue = Queue()
    receiver.downloader_queue = downloader_queue
    downloader.queue = downloader_queue

    # setup storage provider
    storage = setup_storage()
    uploader.storage = storage.clone()
    downloader.storage = storage

    # start all threads
    watcher.start_watching(uploader_queue)
    indexer.start()
    uploader.start()
    sender.start()
    receiver.start()
    downloader.start()

    # sleep until signaled, which will call sig_handler
    while True:
        time.sleep(86400)  # = 24 hours just for fun
def __init__(self):
    # create indexer
    self.Idx = Indexer()
    # create two connection instances
    self.Post = None
    self.InvIdx = None
    self.index_fields = []
def main():
    """ Main function """
    # Download data for NLTK if not already done
    # nltk.download('all')

    # Read
    imdb = Indexer()
    imdb_file = 'data/data.json'
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
    logging.info('Reading file %s' % imdb_file)
    imdb.read_file(imdb_file)
    logging.info('File %s read' % imdb_file)

    (vocab_size, user_list, movie_list,
     rating_matrix, review_matrix, review_map) = imdb.get_mappings()

    # Get number of users and movies (U and M are used in the loops below)
    U = len(user_list)
    M = len(movie_list)
    logging.info('No. of users U = %d' % U)
    logging.info('No. of movies M = %d' % M)

    # Run Gibbs EM
    for it in xrange(1, MAX_ITER + 1):
        logging.info('Running iteration %d of Gibbs EM' % it)
        logging.info('Running E-Step - Gibbs Sampling')
        gibbs_sampler = GibbsSampler(5, A, 2)
        gibbs_sampler.run(rating_matrix)
        logging.info('Running M-Step - Gradient Descent')
        for i in xrange(1, MAX_OPT_ITER + 1):
            optimizer()

    # Output Predicted Ratings
    for u in range(U):
        for m in range(M):
            pred_rate = predicted_rating(u, m)
            print "Predicted Rating of user " + str(u) + " and movie " + str(m) + ": " + str(pred_rate)
def setUp(self):
    self.__reset_listdir_mapping()
    self.__reset_isdir_mapping()

    self.mock_listdir_patcher = patch('os.listdir')
    self.mock_listdir = self.mock_listdir_patcher.start()
    self.mock_listdir.side_effect = mock_listdir

    self.mock_isdir_patcher = patch('os.path.isdir')
    self.mock_isdir = self.mock_isdir_patcher.start()
    self.mock_isdir.side_effect = mock_isdir

    self.mock_copy_patcher = patch('shutil.copy')
    self.mock_copy = self.mock_copy_patcher.start()

    self.mock_open_patcher = patch('indexer.open')
    self.mock_open = self.mock_open_patcher.start()
    # http://stackoverflow.com/questions/24779893/customizing-unittest-mock-mock-open-for-iteration
    self.mock_open.return_value = mock_open(read_data='fake-file-contents').return_value

    self.mock_remove_patcher = patch('os.remove')
    self.mock_remove = self.mock_remove_patcher.start()

    self.mock_config = Mock(spec=Config)
    self.mock_config.haystack_root.return_value = '/root'
    self.mock_config.staging_root.return_value = '/root/staging'
    self.mock_config.thumbnail_path_pattern.return_value = '/root/thumbnails/%Y/%M/%D'
    self.mock_config.picture_path_pattern.return_value = '/root/pictures/%Y/%M/%D'
    self.mock_config.video_path_pattern.return_value = '/root/videos/%Y/%M/%D'
    self.mock_config.staging_directory.side_effect = mock_staging_dir

    self.mock_metadata_helper = Mock(spec=MetadataHelper)
    self.mock_metadata_helper.get_date_taken.return_value = 1449176000

    self.mock_index = Mock(spec=Index)
    self.mock_index.is_duplicate.return_value = False

    self.mock_thumbnail_generator = Mock(spec=ThumbnailGenerator)
    self.mock_util = Mock(spec=Util)
    self.mock_video_converter = MagicMock(spec=VideoConverter)
    self.mock_preprocessor = MagicMock(spec=Preprocessor)

    self.test_model = Indexer(self.mock_config,
                              self.mock_index,
                              self.mock_metadata_helper,
                              self.mock_thumbnail_generator,
                              self.mock_util,
                              self.mock_video_converter,
                              self.mock_preprocessor)
def __init__(self, input_dir):
    '''
    '''
    self._input_dir = input_dir
    self._indexer = Indexer()

    self._no_workers = mp.cpu_count() - 1  # leave one main process out
    self._active_workers = mp.Queue(self._no_workers)
    self._loading_queue = []  # mp.Queue()
    self._viewing_queue = []

    self._sections = None
    self._views = {}
    self._zoomlevels = None
    self._client_tile_size = 512
def run(self):
    """ Starts the main loop """
    self._load_configuration()
    self._init_database()
    self.pb = PhoneBook(self.dbconn)
    self.indexer = Indexer()

    logging.info("Starting IRCThread thread")
    self.irc_thread = IRCThread(self,
                                self.config['server'],
                                self.config['server_port'],
                                self.config['nickname'],
                                self.config['channel'])
    self.irc_thread.start()

    logging.info("Starting webserver")
    http_thread = HTTPThread(self, ('0.0.0.0', 8090))
    http_thread.start()

    logging.info("Starting main loop")
    self._main_loop()
def start(self, args):
    logger = Logger()
    backend = OutputElasticSearch(args.es_server, args.index)
    parsers = ParserPlugins()

    indexer = Indexer(logger, backend, parsers)
    indexer.ignore_extensions(self.ignore_extensions)

    if args.check_removed:
        indexer.check_removed()
    if args.index_dir:
        indexer.directory(args.index_dir)
    if args.truncate:
        backend.truncate()
    if args.webserver:
        import webserver
        webserver.start(backend)
def __init__(self, node_lists):
    self.dim = len(node_lists)
    self.node_lists = np.array(node_lists)  # List of np.ndarray cutpoint locations
    for nl in node_lists:
        assert nl.ndim == 1   # 1D array
        assert nl.size >= 2   # At least two nodes
        assert is_sorted(nl)

    # Number of cutpoints along each dimension
    desc = [(nl[0], nl[-1], nl.size) for nl in node_lists]
    (low, hi, num) = zip(*desc)
    self.lower_bound = np.array(low)
    self.upper_bound = np.array(hi)
    self.num_nodes = np.array(num)
    self.num_cells = self.num_nodes - 1

    # Initialize the indexer
    self.indexer = Indexer(self.num_nodes)

    # Fuzz to convert [low,high) to [low,high]
    self.fuzz = 1e-12
tag_list = []
for sentence in train_sentences:
    for (word_text, ner_tag) in sentence:
        word_list.append(word_text)
        tag_list.append(ner_tag)

embedding_by_word = {}
for line in open('/Users/konix/Documents/pos_data/glove.6B/glove.6B.300d.txt', 'rb').readlines():
    word, embedding_str = line.split(' ', 1)
    embedding = np.asarray([float(value_str) for value_str in embedding_str.split()])
    embedding_by_word[word] = embedding

word_counter = Counter(word_list)
word_indexer = Indexer()
word_indexer.index_object_list([word_text for (word_text, word_count) in word_counter.iteritems()
                                if word_count >= 5])
word_indexer.index_object_list(embedding_by_word.keys())
unk_word_index = word_indexer.index_object('_UNK_')

tag_counter = Counter(tag_list)
tag_indexer = Indexer()
tag_indexer.index_object_list(tag_counter.keys())
tag_indexer.index_object('_START_')

model = Model()
sgd = AdamTrainer(model)
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        reader = ReadFile('')
        documents_list = reader.read_fn(fn)

        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        self._indexer.after_indexing()
        self._indexer.save_index("inverted_idx")
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        self.load_index("inverted_idx.pkl")
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
from indexer import Indexer
import sys

source_file = sys.argv[1]
index_file = sys.argv[2]
line_number = int(sys.argv[3])

idxr = Indexer(source_file, index_file)
with idxr as i:
    print i.read(line_number)
class SearchEngine:

    num_of_tweets = 0

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    def get_num_of_tweets(self):
        return self.num_of_tweets

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        self.num_of_tweets = len(documents_list)

        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            parsed_document.num_of_tweets = self.num_of_tweets
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')
        # TODO: check indexer saving
        utils.save_obj(self._indexer.inverted_idx, "inverted_idx")

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        inverted_idx = self._indexer.load_index(fn)
        return inverted_idx

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
""" query_as_list = self._parser.parse_sentence(query, 0) original_query_list = query.split(" ") stop_words = stopwords.words('english') original_query_list = [ w for w in original_query_list if w not in stop_words ] # find long terms and upper case words counter = 0 while counter < len(original_query_list): len_term = 1 word = original_query_list[counter] if word.isupper(): # NBA if word.find("\n") != -1: word = word[:-1] if word.find(".") != -1: word = word[:-1] query_as_list.append(word) elif len(word) > 1 and re.search( '[a-zA-Z]', word) and word[0].isupper(): # upper first char term = word if original_query_list.index(word) + 1 < len( original_query_list): index = original_query_list.index(word) + 1 while index < len(original_query_list): # find all term if len(original_query_list[index]) > 1 and re.search('[a-zA-Z]', original_query_list[index]) and \ original_query_list[index][0].isupper(): new_word2 = original_query_list[index][ 0] + original_query_list[index][1:].lower( ) # Donald Trump term += " " + new_word2 index += 1 len_term += 1 else: break if len_term > 1: query_as_list.append(term) counter += len_term spell_checker = SpellChecker_ranker.correct_query(query_as_list) searcher = Searcher(self._parser, self._indexer, model=self._model) return searcher.search(spell_checker) # TODO: add K results
>>> expand_statement("m10m")
"margin: 10em"
>>> expand_statement(" m10m")
" margin: 10em"
>>> expand_property("pad")
"padding:"
"""

import re

from definitions import definitions
from indexer import Indexer

# Indexing
index = Indexer()
index.index(definitions)

# Also see http://www.w3.org/TR/css3-values/
line_expr = re.compile(r'^(\s*)(.*?)$')
rule_expr = re.compile(r'^((?:[a-z]+-)*[a-z]+): *([^\s].*?);?$')
value_expr = re.compile(
    r'^([^\.\d-]*)(-?\d*\.?\d+)(x|p[tcx]?|e[mx]?|s|m[ms]?|rem|ch|v[wh]|vmin|max|%|)$')
semicolon_expr = re.compile(r';\s*$')
selectorlike_expr = re.compile(
    r'.*(link|visited|before|placeholder|root|after|focus|hover|active|checked|selected).*')
ends_in_brace_expr = re.compile(r'.*\{\s*$')
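# For illustration: how value_expr splits a shorthand value into
# (property fragment, number, unit). These groups are what the regex above
# actually captures; expand_statement itself is defined elsewhere in this
# module:
#
#     >>> value_expr.match('m10m').groups()
#     ('m', '10', 'm')
#     >>> value_expr.match('10.5em').groups()
#     ('', '10.5', 'em')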
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 13 16:04:56 2017

@author: Ashwin
MLSALT 5: Question 5
"""

from indexer import Indexer
from datetime import datetime

startTime = datetime.now()

indexer = Indexer('decode.ctm')
test = indexer.makeGraphDict('grapheme.map')
indexer.queries('queries.xml')
indexer.hitsHeader('decode-grph.xml')
indexer.hitsFile('decode-grph.xml', 'TRUE')
# queryMorpy = indexer.queryMorphDict(0, 'morph.kwslist.dct')
# indexer.initWithMorph('decode.ctm', 'morph.dct')
# indexer.morphQueryToHits('queries.xml', 'morph.kwslist.dct', 'decode-word-morph.xml', 'TRUE')

print(datetime.now() - startTime)
import argparse
import struct
import gzip
import sys

import document_pb2
from indexer import Indexer
from compression import VARBYTE, SIMPLE9
from docreader import DocumentStreamReader


def parse_command_line():
    parser = argparse.ArgumentParser(description='compressed documents reader')
    parser.add_argument('args', nargs='+', help='Input files (.gz or plain) to process')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_command_line().args
    compression = args.pop(0)
    reader = DocumentStreamReader(args)

    if compression == "simple9":
        compression = SIMPLE9
    else:
        compression = VARBYTE

    indexer = Indexer(compression)
    for doc_id, doc in enumerate(reader):
        indexer.handle_doc(doc, doc_id + 1)
    indexer.save_index()
def __init__(self):
    self.nodes = Indexer()
    self.links = defaultdict(list)
    self.redirects = {}
#!/usr/bin/env python
import os, sys

basedir = os.path.abspath(os.path.dirname(__file__))
libdir = os.path.abspath(os.path.join(basedir, '../lib'))
sys.path.append(libdir)

from indexer import Indexer

if __name__ == '__main__':
    ix = Indexer()
    ix.process()
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        if self._config:
            if not hasattr(self._config, 'toStem'):
                self._config.toStem = False
            if not hasattr(self._config, 'toLemm'):
                self._config.toLemm = False
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        self.corpus_size = 0
        self.load_precomputed_model()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        # Save the inverted index to disk
        self._indexer.save_index(self._config.get_output_path())
        self.corpus_size = self._indexer.get_docs_count()
        self.calculate_doc_weight()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        self._model = SpellCheck

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)

    def calculate_doc_weight(self):
        """
        Calculates the TF-IDF weight of each term in each document and
        accumulates every document's total squared weight.
        """
        for word in self._indexer.inverted_idx:
            for doc_id in self._indexer.inverted_idx[word]['posting_list']:
                normalized_term_tf = self._indexer.inverted_idx[word]["posting_list"][doc_id][0]
                term_df = self._indexer.inverted_idx[word]['df']
                term_idf = math.log10(self.corpus_size / term_df)
                # calculate doc's total weight
                term_weight = normalized_term_tf * term_idf
                self._indexer.inverted_idx[word]["posting_list"][doc_id].append(term_weight)
                term_weight_squared = math.pow(term_weight, 2)
                self._indexer.docs_index[doc_id][0] += term_weight_squared
                self._indexer.docs_index[doc_id][0] = round(
                    self._indexer.docs_index[doc_id][0], 3)
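# Worked example of the weight formula in calculate_doc_weight (numbers are
# illustrative, not from the original source): with corpus_size = 1000 and a
# term appearing in df = 10 documents, idf = log10(1000 / 10) = 2.0; a
# normalized tf of 0.5 then gives term_weight = 0.5 * 2.0 = 1.0, and
# 1.0**2 = 1.0 is added to the document's accumulated squared weight.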
        for docID in doclists[0]:
            flag = True
            for i in range(1, len(doclists)):
                if docID not in doclists[i]:
                    flag = False
            if flag:
                res.append(docID)
        return res

    def tokenize_query(self, q):
        if self.type == 'simple':
            return self.tokenize_simple_query(q)
        else:
            return ['']

    def tokenize_simple_query(self, q):
        return [i.replace(' ', '') for i in q.split('&')]


if __name__ == '__main__':
    indx = Indexer()
    indx.read()
    search = Searcher(indx)
    while True:
        words = sys.stdin.readline()
        if not words:
            break
        if words[-1] == '\n':
            words = words[:-1]
        print words
        res = search.search(words.decode('utf8').lower())
        print len(res)
        for i in res:
            print search.indx.urls[i]
def main():
    Session()
    indexer = Indexer()
    indexer.run()
from settings import config, versions
from version import read_readmes

app = Flask(__name__, static_url_path='', static_folder='public')
app.add_url_rule('/', 'root', lambda: app.send_static_file('index.html'))
app.add_url_rule('/lees-impact-vragenlijst-nl-2019/',
                 'reading-impact-questionnaire-nl-2019',
                 lambda: app.send_static_file('questionnaire-nl-2019/index.html'))
app.add_url_rule('/reading-impact-questionnaire-en-2020/',
                 'reading-impact-questionnaire-en-2020',
                 lambda: app.send_static_file('questionnaire-en-2020/index.html'))
cors = CORS(app)

es_indexer = Indexer(config)
readme = read_readmes()


def read_boilerplate(version: str) -> Dict[str, str]:
    with open(versions[version]['boilerplate_file'], 'rt') as fh:
        return json.load(fh)


def read_questions(version: str) -> Dict[str, str]:
    with open(versions[version]['questions_file'], 'rt') as fh:
        return json.load(fh)


def make_response(response_data: Union[List[Dict[str, any]], Dict[str, any]]):
    return Response(json.dumps(response_data),
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        self.map_list = []
        self.prec5_list = []
        self.prec10_list = []
        self.prec50_list = []
        self.prec_total_list = []
        self.recall_list = []

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        print("\nNow Starting search engine 2")
        total_time = datetime.now()
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        # print("len of inverted: ", len(self._indexer.inverted_idx))
        # print("len of posting: ", len(self._indexer.postingDict))
        # print("len of dataSet: ", len(self._indexer.benchDataSet))
        # end_time = datetime.now()
        # print('\n ------ Time To Retrieve: {}'.format(end_time - total_time), " ------\n")
        # print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)

    def run_engine_two(self, fn):
        self.build_index_from_parquet(fn)
        queries_path = "data\\queries_train.tsv"
        all_queries = SearchEngine.query_reader(queries_path)["information_need"]

        for i, q in enumerate(all_queries):
            print(q)
            k, docs = self.search(q)
            # print(docs[:10])
            self.check_engine_quality(i + 1, docs[:300])
            print()

        print("Avg map is :", (sum(self.map_list) / len(self.map_list)))

    @staticmethod
    def query_reader(queries_path):
        data = pd.read_csv(queries_path, sep="\t")
        return data

    def get_parser(self):
        return self._parser

    def check_engine_quality(self, query_num, list_of_docs):
        """
        :param query_num:
        :param list_of_docs:
        :return: no return. prints metrics of the query:
            precision, recall, map.
""" benchmark_path = "data\\benchmark_lbls_train.csv" df = pd.read_csv(benchmark_path) df_prec = df[df['query'] == query_num] df_prec = df_prec[df_prec['tweet'].isin(list_of_docs)] dict_for_data = df_prec.set_index('tweet')['y_true'].to_dict() rmv_lst = [] ranking = [] # Add to list for rank for doc in list_of_docs: try: ranking.append(dict_for_data[int(doc)]) except: rmv_lst.append(doc) for d in rmv_lst: list_of_docs.remove(d) data_df = pd.DataFrame({ 'query': query_num, 'tweet': list_of_docs, 'y_true': ranking }) df_rec = df[df['query'] == query_num] recall_total = len(df_rec[df_rec['y_true'] == 1.0]) # print("total Relevant doc found with tag 1 :" , len (data_df[data_df['y_true'] == 1.0])) # print("total NON relevant doc found with tag 0 :" , len (data_df[data_df['y_true'] == 0])) # print("found total of", len(df_prec), "tagged docs") # Calculate metrics and print prec5 = metrics.precision_at_n(data_df, query_num, 5) prec10 = metrics.precision_at_n(data_df, query_num, 10) prec50 = metrics.precision_at_n(data_df, query_num, 50) prec_total = metrics.precision(data_df, True, query_number=query_num) map_of_query = metrics.map(data_df) recall_val = metrics.recall_single(data_df, recall_total, query_num) self.map_list.append(map_of_query) self.prec5_list.append(prec5) self.prec10_list.append(prec10) self.prec50_list.append(prec50) self.prec_total_list.append(prec_total) self.recall_list.append(recall_val) print() print("precision at 5 of query", query_num, "is :", prec5) print("precision at 10 of query", query_num, "is :", prec10) print("precision at 50 of query", query_num, "is :", prec50) print("precision of query", query_num, "is :", prec_total) print("recall of query", query_num, "is :", recall_val) print("map of query", query_num, "is :", map_of_query)
from indexer import Indexer
from scraper import scrape
import datetime as dt
from database_utils import DBSession
from database_optimalisation import optimize_my_database as optimize
from database_operations import run_operations as operate
from database import LastRun
from ner import NERserver

dbs = DBSession().session
ner_server = NERserver()

# date = dbs.query(LastRun.date).order_by(LastRun.id.desc()).first()[0]
date = dt.date(year=2009, month=1, day=1)

print("Indexing...", end="", flush=True)
index = Indexer(date, local=True)
index.bp_index()
print(" finished!")

ner_server.start()
print("Scraping...", end="", flush=True)
scrape(date, what_to_do="references people", local=True)
ner_server.stop()
print(" finished!")

print("Optimizing...", end="", flush=True)
optimize()
operate()
print(" finished!")

dbs.add(LastRun(date=dt.date.today()))
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            if parsed_document == {}:  # RT
                continue
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)

        # Drop entries recorded only once
        self._indexer.inverted_idx = {key: val for key, val in self._indexer.inverted_idx.items()
                                      if val != 1}
        self._indexer.postingDict = {key: val for key, val in self._indexer.postingDict.items()
                                     if len(val) != 1}
        print('Finished parsing and indexing.')
        # self._indexer.save_index('idx_bench')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        filename = self._config.google_news_vectors_negative300_path
        self._model = gensim.models.KeyedVectors.load_word2vec_format(
            filename, binary=True, datatype=np.float16)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
def run_engine(config):
    """
    :return:
    """
    number_of_documents = 0
    sum_of_doc_lengths = 0

    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem)
    indexer = Indexer(config, glove_dict)
    # documents_list = r.read_file(file_name=config.get__corpusPath())
    parquet_documents_list = r.read_folder(config.get__corpusPath())
    for parquet_file in parquet_documents_list:
        documents_list = r.read_file(file_name=parquet_file)
        # Iterate over every document in the file
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            if parsed_document is None:
                continue
            number_of_documents += 1
            sum_of_doc_lengths += parsed_document.doc_length
            # index the document data
            indexer.add_new_doc(parsed_document)

    # saves last posting file after indexer has done adding documents.
    indexer.save_postings()
    if len(indexer.doc_posting_dict) > 0:
        indexer.save_doc_posting()
    utils.save_dict(indexer.document_dict, "documents_dict", config.get_out_path())
    if len(indexer.document_posting_covid) > 0:
        indexer.save_doc_covid()

    indexer.delete_dict_after_saving()

    # merges posting files.
    indexer.merge_chunks()
    utils.save_dict(indexer.inverted_idx, "inverted_idx", config.get_out_path())

    dits = {'number_of_documents': number_of_documents,
            "avg_length_per_doc": sum_of_doc_lengths / number_of_documents}
    utils.save_dict(dits, 'details', config.get_out_path())
def main():
    """ Main function """
    # Download data for NLTK if not already done
    # nltk.download('all')

    # Read
    np.random.seed(5)
    baseline = False  # Make this True to run the baseline, a simple latent factor model
    path_to_save_results = './test/'
    imdb = Indexer()
    imdb_file = 'data/clothing_data_small.json'  # path to data file
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
    logging.info('Reading file %s' % imdb_file)
    imdb.read_file(imdb_file)
    logging.info('File %s read' % imdb_file)

    (vocab_size,
     user_list,  # remove
     movie_list,
     review_matrix,
     review_map,
     user_dict,
     movie_dict,
     rating_list,
     t_mean,
     movie_reviews,
     word_dictionary,
     U, M, R,
     test_indices) = imdb.get_mappings(path_to_save_results)

    mul_factor = 0.1

    # Initialize
    alpha_vu = np.random.normal(0, sigma_u, (U, K)) * mul_factor
    alpha_bu = np.random.normal(0, sigma_u, (U, 1)) * mul_factor
    alpha_tu = np.random.normal(0, sigma_u, (U, A)) * mul_factor

    # User
    v_u = np.random.normal(0, sigma_u, (U, K)) * mul_factor       # Latent factor vector
    b_u = np.random.normal(0, sigma_bu, (U, 1)) * mul_factor      # Common bias vector
    theta_u = np.random.normal(0, sigma_ua, (U, A)) * mul_factor  # Aspect specific vector

    # Movie
    v_m = np.random.normal(0, sigma_m, (M, K)) * mul_factor       # Latent factor vector
    b_m = np.random.normal(0, sigma_bm, (M, 1)) * mul_factor      # Common bias vector
    theta_m = np.random.normal(0, sigma_ma, (M, A)) * mul_factor  # Aspect specific vector

    # Common bias
    b_o = np.random.normal(0, sigma_b0) * mul_factor

    # Scaling Matrix
    M_a = np.random.normal(0, sigma_Ma, (A, K)) * mul_factor

    params = np.concatenate(
        (alpha_vu.flatten('F'), v_u.flatten('F'), alpha_bu.flatten('F'),
         b_u.flatten('F'), alpha_tu.flatten('F'), theta_u.flatten('F'),
         v_m.flatten('F'), b_m.flatten('F'), theta_m.flatten('F'),
         M_a.flatten('F'), np.array([b_o]).flatten('F')))
    save_test_rmse = []

    # Get number of users and movies
    Users = len(user_list)
    Movies = len(movie_list)
    logging.info('No. of users U = %d' % Users)
    logging.info('No. of movies M = %d' % Movies)

    # change gibbs sampler initialization
    gibbs_sampler = GibbsSampler(vocab_size, review_matrix, rating_list, movie_dict,
                                 user_dict, movie_reviews, word_dictionary,
                                 U, M, R, test_indices)

    # Run Gibbs EM
    for it in range(1, MAX_ITER + 1):
        print('Running iteration %d of Gibbs EM' % it)
        print('Running E-Step - Gibbs Sampling')
        if not baseline:
            Nums, Numas, Numa = gibbs_sampler.run(vocab_size, review_matrix, rating_list,
                                                  user_dict, movie_dict, movie_reviews,
                                                  word_dictionary, t_mean, params,
                                                  test_indices, path_to_save_results)
        else:
            Nums = np.zeros((R, 2))
            Numas = np.zeros((R, A, 2))
            Numa = np.zeros((R, A))
        print('Running M-Step - Gradient Descent')
        for i in range(1, MAX_OPT_ITER + 1):
            params, save_test_rmse = optimizer(Nums, Numas, Numa, rating_list, t_mean,
                                               params, U, M, R, test_indices, save_test_rmse)

    np.save(path_to_save_results + 'params.npy', params)
    np.save(path_to_save_results + 'performance_notime_medium_noreg_seed5.npy',
            save_test_rmse)
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        self._parser.curr_idx = self.parse_and_index_tweet_list(documents_list, 0)
        self._indexer.save_index('idx_bench.pkl')
        print('Finished parsing and indexing.')

    def parse_and_index_tweet_list(self, documents_list, idx):
        for document in documents_list:
            # parse the document
            self._parser.curr_idx = idx
            parsed_document = self._parser.parse_doc(document)
            # add the document to the indexer
            self._indexer.set_idx(idx)
            self._indexer.add_new_doc(parsed_document)
            idx += 1
        return idx - 1

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        self._model = _Thesaurus()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
def setUp(self):
    self.indexer = Indexer("database")
class TestMyCode(unittest.TestCase):

    def setUp(self):
        self.maxDiff = None
        self.window = Context_Window(
            'The girl named Alina Zakharova is a student',
            [Position_Plus(0, 4, 20), Position_Plus(0, 9, 30)], 8, 20)

    def tearDown(self):
        if hasattr(self, 'search'):
            del self.search
        file_list = os.listdir(path=".")
        for i in file_list:
            if i == 'database':
                database_exists = True
                os.remove(i)
            elif i.startswith('database.'):
                database_exists = True
                os.remove(i)

    def test_get_window_error(self):
        with self.assertRaises(TypeError):
            self.window.get_window(12, '12')

    def test_get_window_simple(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_window_one.txt', 'w')
        test_file_one.write('Alina Zakharova is a student)))')
        test_file_one.close()
        self.indexator.get_index_with_line('test_window_one.txt')
        del self.indexator
        self.search = SearchEngine('database')
        result = windows.Context_Window.get_window('test_window_one.txt',
                                                   Position_Plus(0, 16, 18), 1)
        self.win = Context_Window('string', 'positions', 'win_start', 'win_end')
        self.win.string = 'Alina Zakharova is a student)))'
        self.win.positions = [Position_Plus(0, 16, 18)]
        self.win.win_start = 6
        self.win.win_end = 20
        self.assertEqual(result.string, self.win.string)
        self.assertEqual(result.positions, self.win.positions)
        self.assertEqual(result.win_start, self.win.win_start)
        self.assertEqual(result.win_end, self.win.win_end)
        self.assertEqual(result, self.win)
        os.remove('test_window_one.txt')

    def test_get_window_simple_plus(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_window_two.txt', 'w')
        test_file_one.write('Little Alina Zakharova is a linguist student)))')
        test_file_one.close()
        self.indexator.get_index_with_line('test_window_two.txt')
        del self.indexator
        self.search = SearchEngine('database')
        result = windows.Context_Window.get_window('test_window_two.txt',
                                                   Position_Plus(0, 23, 25), 2)
        self.win = Context_Window('string', 'positions', 'win_start', 'win_end')
        self.win.string = 'Little Alina Zakharova is a linguist student)))'
        self.win.positions = [Position_Plus(0, 23, 25)]
        self.win.win_start = 7
        self.win.win_end = 36
        self.assertEqual(result.string, self.win.string)
        self.assertEqual(result.positions, self.win.positions)
        self.assertEqual(result.win_start, self.win.win_start)
        self.assertEqual(result.win_end, self.win.win_end)
        self.assertEqual(result, self.win)
        os.remove('test_window_two.txt')

    def test_get_window_begin(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_window_three.txt', 'w')
        test_file_one.write('Alina Zakharova is a student')
        test_file_one.close()
        self.indexator.get_index_with_line('test_window_three.txt')
        del self.indexator
        self.search = SearchEngine('database')
        result = windows.Context_Window.get_window('test_window_three.txt',
                                                   Position_Plus(0, 0, 5), 1)
        self.win = Context_Window('string', 'positions', 'win_start', 'win_end')
        self.win.string = 'Alina Zakharova is a student'
        self.win.positions = [Position_Plus(0, 0, 5)]
        self.win.win_start = 0
        self.win.win_end = 15
        self.assertEqual(result.string, self.win.string)
        self.assertEqual(result.positions, self.win.positions)
        self.assertEqual(result.win_start, self.win.win_start)
        self.assertEqual(result.win_end, self.win.win_end)
        self.assertEqual(result, self.win)
        os.remove('test_window_three.txt')

    def test_get_window_end(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_window_four.txt', 'w')
        test_file_one.write('Alina Zakharova is a student')
        test_file_one.close()
        self.indexator.get_index_with_line('test_window_four.txt')
        del self.indexator
        self.search = SearchEngine('database')
        result = windows.Context_Window.get_window('test_window_four.txt',
                                                   Position_Plus(0, 21, 28), 3)
        self.win = Context_Window('string', 'positions', 'win_start', 'win_end')
        self.win.string = 'Alina Zakharova is a student'
        self.win.positions = [Position_Plus(0, 21, 28)]
        self.win.win_start = 6
        self.win.win_end = 28
        self.assertEqual(result.string, self.win.string)
        self.assertEqual(result.positions, self.win.positions)
        self.assertEqual(result.win_start, self.win.win_start)
        self.assertEqual(result.win_end, self.win.win_end)
        self.assertEqual(result, self.win)
        os.remove('test_window_four.txt')

    def test_myError_str_not_found(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_window_five.txt', 'w')
        test_file_one.write('Alina Zakharova is a student')
        test_file_one.close()
        self.indexator.get_index_with_line('test_window_five.txt')
        del self.indexator
        self.search = SearchEngine('database')
        with self.assertRaises(TypeError):
            result = windows.Context_Window.get_window('test_window_five.txt',
                                                       Position_Plus(3, 21, 28), 3)
        os.remove('test_window_five.txt')

    def test_united_type_error(self):
        with self.assertRaises(TypeError):
            self.window.get_united_window(12, 'window)))')

    def test_crossed_type_error(self):
        with self.assertRaises(TypeError):
            self.window.is_crossed(12, 'window)))')

    def test_united_window(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_united_window.txt', 'w')
        test_file_one.write('The girl named Alina Zakharova is a student')
        test_file_one.close()
        self.indexator.get_index_with_line('test_united_window.txt')
        del self.indexator
        self.search = SearchEngine('database')
        window_A = windows.Context_Window.get_window('test_united_window.txt',
                                                     Position_Plus(0, 4, 20), 1)
        window_B = windows.Context_Window.get_window('test_united_window.txt',
                                                     Position_Plus(0, 9, 30), 1)
        window_A.get_united_window(window_B)
        self.win = windows.Context_Window(
            'The girl named Alina Zakharova is a student',
            [Position_Plus(0, 4, 20), Position_Plus(0, 9, 30)], 9, 20)
        self.assertEqual(window_A.string, self.win.string)
        self.assertEqual(window_A.win_start, self.win.win_start)
        self.assertEqual(window_A.win_end, self.win.win_end)
        os.remove('test_united_window.txt')

    def test_is_crossed(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_crossed_window.txt', 'w')
        test_file_one.write('The girl named Alina Zakharova is a student')
        test_file_one.close()
        self.indexator.get_index_with_line('test_crossed_window.txt')
        del self.indexator
        self.search = SearchEngine('database')
        window_A = windows.Context_Window.get_window('test_crossed_window.txt',
                                                     Position_Plus(0, 15, 20), 1)
        window_B = windows.Context_Window.get_window('test_crossed_window.txt',
                                                     Position_Plus(0, 8, 14), 1)
        crossed_AB = window_A.is_crossed(window_B)
        self.assertEqual(True, crossed_AB)
        os.remove('test_crossed_window.txt')

    def test_not_crossed(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_not_crossed_window.txt', 'w')
        test_file_one.write('The girl named Alina Zakharova is a student')
        test_file_one.close()
        self.indexator.get_index_with_line('test_not_crossed_window.txt')
        del self.indexator
        self.search = SearchEngine('database')
        window_A = windows.Context_Window.get_window('test_not_crossed_window.txt',
                                                     Position_Plus(0, 31, 33), 1)
        window_B = windows.Context_Window.get_window('test_not_crossed_window.txt',
                                                     Position_Plus(0, 8, 14), 1)
        crossed_AB = window_A.is_crossed(window_B)
        self.assertEqual(False, crossed_AB)
        os.remove('test_not_crossed_window.txt')

    def test_extend_window(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_extend_window.txt', 'w')
        test_file_one.write('Alina Zakharova is a student!!')
        test_file_one.close()
        self.indexator.get_index_with_line('test_extend_window.txt')
        del self.indexator
        self.search = SearchEngine('database')
        window = windows.Context_Window.get_window('test_extend_window.txt',
                                                   Position_Plus(0, 6, 15), 1)
        window.extend_window()
        extended_window = Context_Window('Alina Zakharova is a student!!',
                                         [Position_Plus(0, 6, 15)], 0, 30)
        self.assertEqual(window, extended_window)
        os.remove('test_extend_window.txt')

    def test_extend_window_two_words(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_extend_window.txt', 'w')
        test_file_one.write('Alina Zakharova is a student!!')
        test_file_one.close()
        self.indexator.get_index_with_line('test_extend_window.txt')
        del self.indexator
        self.search = SearchEngine('database')
        window_one = windows.Context_Window.get_window('test_extend_window.txt',
                                                       Position_Plus(0, 6, 15), 1)
        window_two = windows.Context_Window.get_window('test_extend_window.txt',
                                                       Position_Plus(0, 0, 5), 1)
        window_one.get_united_window(window_two)
        window_one.extend_window()
        extended_window = Context_Window(
            'Alina Zakharova is a student!!',
            [Position_Plus(0, 6, 15), Position_Plus(0, 0, 5)], 0, 30)
        self.assertEqual(window_one, extended_window)
        os.remove('test_extend_window.txt')

    def test_extend_window_rus(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_extend_window_rus.txt', 'w')
        test_file_one.write('Прогать очень сложно! Алина Захарова студент лингвист!! Аня любит немецкий. В Петербурге идет дождь.')
        test_file_one.close()
        self.indexator.get_index_with_line('test_extend_window_rus.txt')
        del self.indexator
        self.search = SearchEngine('database')
        window = windows.Context_Window.get_window('test_extend_window_rus.txt',
                                                   Position_Plus(0, 28, 36), 1)
        window.extend_window()
        extended_window = Context_Window(
            'Прогать очень сложно! Алина Захарова студент лингвист!! Аня любит немецкий. В Петербурге идет дождь.',
            [Position_Plus(0, 28, 36)], 22, 55)
        self.assertEqual(window, extended_window)

    def test_extend_window_rus_one(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_extend_window_rus.txt', 'w')
        test_file_one.write('Пьер с грустью слышал над собою насмешки.')
        test_file_one.close()
        self.indexator.get_index_with_line('test_extend_window_rus.txt')
        del self.indexator
        self.search = SearchEngine('database')
        window = windows.Context_Window.get_window('test_extend_window_rus.txt',
                                                   Position_Plus(0, 0, 4), 1)
        window.extend_window()
        extended_window = Context_Window(
            'Пьер с грустью слышал над собою насмешки.',
            [Position_Plus(0, 0, 4)], 0, 41)
        self.assertEqual(window, extended_window)

    def test_extend_window_rus_two(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_extend_window_rus.txt', 'w')
        test_file_one.write('С разных сторон виднелись пожары. Пьер тогда еще не понимал значения сожженной Москвы и с ужасом смотрел на эти пожары.')
        test_file_one.close()
        self.indexator.get_index_with_line('test_extend_window_rus.txt')
        del self.indexator
        self.search = SearchEngine('database')
        window = windows.Context_Window.get_window('test_extend_window_rus.txt',
                                                   Position_Plus(0, 34, 38), 1)
        window.extend_window()
        extended_window = Context_Window(
            'С разных сторон виднелись пожары. Пьер тогда еще не понимал значения сожженной Москвы и с ужасом смотрел на эти пожары.',
            [Position_Plus(0, 34, 38)], 0, 119)
        self.assertEqual(window, extended_window)

    def test_already_extended_window(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_already_extended_window.txt', 'w')
        test_file_one.write('Alina Zakharova is a student!!')
        test_file_one.close()
        self.indexator.get_index_with_line('test_already_extended_window.txt')
        del self.indexator
        self.search = SearchEngine('database')
        window = windows.Context_Window.get_window('test_already_extended_window.txt',
                                                   Position_Plus(0, 16, 18), 2)
        os.remove('test_already_extended_window.txt')

    def test_highlight_window_one(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_highlight_window.txt', 'w')
        test_file_one.write('Alina Zakharova is a student')
        test_file_one.close()
        self.indexator.get_index_with_line('test_highlight_window.txt')
        del self.indexator
        self.search = SearchEngine('database')
        window = windows.Context_Window.get_window('test_highlight_window.txt',
                                                   Position_Plus(0, 6, 15), 1)
        result = window.highlight_window()
        output_string = 'Alina <b>Zakharova</b> is'
        self.assertEqual(result, output_string)
        os.remove('test_highlight_window.txt')
class SearchEngine:
    """
    If you need to reindex, change the status to False in the
    "index_status.log" file.
    """

    def __init__(self):
        # Make an instance of file handler
        self.file_handler = FileHandler()
        # Make an instance of indexer
        self.indexer = Indexer(self.file_handler, file_count_offset=10000)

        # Check if the indexing is completed. If not, index the documents
        if not self.file_handler.get_index_status():
            self.index()

        # Open files
        self.fp_dict = self.file_handler.load_json('./db/fp_locations.json')
        self.doc_id_dict = self.file_handler.load_json('./db/doc_id.json')
        self.final_index = open('./db/index.txt')
        cached_words = self.cache_stop_words()

        # Cached words are added to the query instance to check during query time
        self.query = Query(self.file_handler, self.indexer, cached_words)

    def cache_stop_words(self):
        cached_words = {}
        stop_words = set(stopwords.words('english'))
        # For every index, cache the stop words
        for line in self.final_index:
            index = Query.fast_eval(line)
            if index[0] in stop_words:
                cached_words[index[0]] = index[1]
        return cached_words

    def index(self):
        start_time = datetime.now()

        # Index the webpages into partial indexes
        self.indexer.index('./DEV', restart=True)
        # Merge partial indexes to one single index
        self.indexer.merge_indexes('./db')
        # Calculate the tf_idf scores for each index
        normalizer = self.indexer.calculate_tf_idf(
            './db/index.txt', './db/index_tf_idf.txt',
            self.file_handler.count_number_of_line('./db/index.txt'))
        # Normalize the tf_idf scores
        self.indexer.normalize_tf_idf('./db/index_tf_idf.txt', './db/index.txt', normalizer)
        # Get file pointer locations for each index
        self.indexer.get_fp_locations('./db/index.txt', './db/fp_locations.json')

        end_time = datetime.now()
        process_time = end_time - start_time
        print("\nStart Time : {}\nEnd Time : {}\nTime elapsed : {}\n".format(
            start_time, end_time, process_time))

    def search(self):
        # Gets query from the user.
        # Start time is calculated as soon as the query is received.
        start_time = self.query.get_query()
        # Process the query
        self.query.process_query()
        # Get result of the query
        result = self.query.get_result()

        end_time = datetime.now()
        process_time = end_time - start_time
        print("\nStart Time : {}\nEnd Time : {}\nTime elapsed : {} ms\n".format(
            start_time, end_time, process_time.total_seconds() * 1000))

    def run(self):
        while True:
            self.search()
parser.add_argument('-test', '--test_mode', action="store_true", help="testing mode")
args = parser.parse_args()

pattern = '*.' + str(args.electrode) + '.wav'
data_dir = args.data_dir
out_dir = args.out_dir
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

if data_dir[-1] != '/':
    data_dir += '/'
subj = int(data_dir[-2])
assert subj in [1, 2, 3]

indexer = Indexer()
tain_idx, test_idx = indexer.run(data_dir, pattern, testing=args.test_mode)

fs = 400
cd = 240000 * 1000 / fs
common_params = dict(sampling_freq=fs, clip_duration=cd, frame_duration=512)
tain_params = AudioParams(random_scale_percent=5.0, **common_params)
test_params = AudioParams(**common_params)
common = dict(target_size=1, nclasses=2)

tain_set = 'full' if args.test_mode else 'tain'
test_set = 'test' if args.test_mode else 'eval'
test_dir = data_dir.replace('train', 'test') if args.test_mode else data_dir

tain = DataLoader(set_name=tain_set, media_params=tain_params, index_file=tain_idx,
def __init__(self):
    self.dbagent = DBAgent()
    self.dbagent_thread = None
    self.communication_object = CommunicationObject()
    self.indexer = Indexer()
from file_reader import FileReader
from indexer import Indexer
from query import Query

f = FileReader(file_name="input/small_foods.txt")
i = Indexer()
q = Query()
a = q.execute_query(['jumbo', 'salted', 'peanuts'])
for key, values in a.iteritems():
    if values > 2:
        print key
class Graph:
    """
    Implements a directed graph.

    self.nodes contains a map from node names to node index.
    self.links contains a map from node index to all linked nodes.
    self.redirects contains a map from transient node index to permanent node index.
    """

    def __init__(self):
        self.nodes = Indexer()
        self.links = defaultdict(list)
        self.redirects = {}

    def __len__(self):
        return len(self.nodes)

    def AddLink(self, from_name, to_name):
        from_index = self.Index(from_name)
        to_index = self.Index(to_name)
        self.links[from_index].append(to_index)

    def AddRedirect(self, from_name, to_name):
        self.redirects[from_name] = to_name

    def Index(self, name):
        index = self.nodes[name]
        while index in self.redirects:
            index = self.redirects[index]
        return index

    def Links(self, from_index):
        if isinstance(from_index, int):
            return self.links[from_index]
        else:
            from_index = self.Index(from_index)
            links = self.links[from_index]
            link_names = [self.nodes.rev[link] for link in links]
            return link_names

    def PageRank(self, reset=0.15, steps_per_iteration=int(1e7),
                 max_iter=100, tol='rank'):
        """
        Computes the Page Rank of each node in the graph.

        @param reset The probability of making a random jump.
        @param steps_per_iteration The number of steps before checking convergence.
        @param max_iter The number of iterations before giving up.
        @param tol The convergence criterion. If 'rank', then it continues
                   until the ordering of pages has stabilized. If the value
                   is a float, then it continues until all values have
                   changed less than the tolerance.
        """
        pure_pages = list(set(self.nodes.values()) - set(self.redirects))

        def random_jump():
            return random.choice(pure_pages)

        if tol == 'rank':
            converged = rank_converged
        else:
            converged = tol_converged(tol)

        page_rank = numpy.zeros(len(self.nodes))
        total_steps = 0
        current = random_jump()
        for iter_num in range(max_iter):
            print('iter_num: ', iter_num)
            iteration_counts = numpy.zeros(len(self.nodes))
            for step_num in range(steps_per_iteration):
                if step_num % 1000000 == 0:
                    print('step_num: ', step_num)
                options = self.Links(current)
                if not options or random.random() < reset:
                    current = random_jump()
                else:
                    current = random.choice(options)
                iteration_counts[current] += 1
            # Weighted average of the new estimate and the previous estimate
            prev = page_rank
            page_rank = (page_rank * total_steps + iteration_counts) \
                / (total_steps + steps_per_iteration)
            total_steps += steps_per_iteration
            if converged(prev, page_rank):
                break
        return page_rank

    def PageRankMatrix(self, reset=0.15, max_iter=100, tol='rank'):
        """
        Computes the Page Rank of each node in the graph.
        Does so using matrix multiplication, rather than a random walk.
        """
        try:
            return self.page_rank
        except AttributeError:
            pass
        if tol == 'rank':
            converged = rank_converged
        else:
            converged = tol_converged(tol)
        num_nodes = len(self.nodes)
        # Transpose self.links, so it can be used to find links to a page,
        # not just from it.
        linked_from = defaultdict(list)
        for from_node, to_node_list in self.links.items():
            for to_node in to_node_list:
                linked_from[to_node].append((from_node, 1 / len(to_node_list)))
        # Find all dangling nodes
        dangling_nodes = set()
        for nodenum in range(num_nodes):
            if nodenum not in self.links or not self.links[nodenum]:
                dangling_nodes.add(nodenum)
        page_rank = numpy.ones(num_nodes) / num_nodes
        for iter_num in range(max_iter):
            print(iter_num)
            prev = page_rank
            page_rank = numpy.zeros(num_nodes)
            dangling_contrib = (1 - reset) * sum(prev[d] for d in dangling_nodes) / num_nodes
            reset_contrib = reset / num_nodes
            for to_index in range(num_nodes):
                link_contrib = (1 - reset) * sum(
                    prev[from_index] * weight
                    for from_index, weight in linked_from[to_index])
                page_rank[to_index] = link_contrib + dangling_contrib + reset_contrib
            if converged(prev, page_rank):
                break
        self.page_rank = page_rank
        return page_rank

    def TopNPages(self, n):
        ranking = self.PageRankMatrix()
        node_names = list(self.nodes)
        node_names.sort(key=lambda name: ranking[self.Index(name)], reverse=True)
        return node_names[:n]

    def WriteAllPageRanks(self, filename):
        ranking = self.PageRankMatrix()
        node_ranks = [(name, ranking[self.Index(name)]) for name in self.nodes]
        node_ranks.sort(key=lambda k: k[1], reverse=True)
        with open(filename, 'w') as f:
            for name, rank in node_ranks:
                f.write('{}\t{}\n'.format(name, rank))

    def ExportCSV(self, filename, n):
        pages = set(self.TopNPages(n))
        with open(filename, 'w') as f:
            f.write('Source,Target\n')
            for i, page in enumerate(pages):
                if i % 1000 == 0:
                    print('Saving page', i)
                for link in self.Links(page):
                    if link in pages:
                        f.write('{},{}\n'.format(page, link))
def main():
    train_words = parse_words(open(TRAIN_FILE_PATH, 'rb'), tag_scheme=TAG_SCHEME)
    train_sentences = split_words_to_sentences(train_words)
    dev_words = parse_words(open(DEV_FILE_PATH, 'rb'), tag_scheme=TAG_SCHEME)
    dev_sentences = split_words_to_sentences(dev_words)
    test_words = parse_words(open(TEST_FILE_PATH, 'rb'), tag_scheme=TAG_SCHEME)

    # Load pre-trained GloVe vectors as the external word embeddings
    external_word_embeddings = {}
    for line in open('/Users/konix/Documents/pos_data/glove.6B/glove.6B.100d.txt',
                     'rb').readlines():
        word, embedding_str = line.split(' ', 1)
        embedding = np.asarray(
            [float(value_str) for value_str in embedding_str.split()])
        external_word_embeddings[word] = embedding

    word_list = []
    char_list = []
    tag_list = []
    for sentence_ in train_sentences:
        for word_ in sentence_:
            word_list.append(word_.text.lower())
            tag_list.append(word_.gold_label)
            char_list.extend(word_.text)

    # Build the word, character, and tag vocabularies
    word_counter = Counter(word_list)
    word_indexer = Indexer()
    word_indexer.index_object_list([
        word_text for (word_text, word_count) in word_counter.iteritems()
        if word_count >= 1  # a threshold of 1 keeps every training word
    ])
    word_indexer.index_object_list(external_word_embeddings.keys())
    word_indexer.index_object('_UNK_')

    char_counter = Counter(char_list)
    char_indexer = Indexer()
    char_indexer.index_object_list(char_counter.keys())

    tag_counter = Counter(tag_list)
    tag_indexer = Indexer()
    tag_indexer.index_object_list(tag_counter.keys())

    tagger = BiLstmNerTagger(word_indexer, char_indexer, tag_indexer,
                             external_word_embeddings)

    del word_list
    del char_list
    del tag_list
    del external_word_embeddings
    gc.collect()

    tagger.train(train_sentences, dev_sentences, iterations=50)

    # Tag the dev and test sets sentence by sentence
    word_index = 0
    while word_index < len(dev_words):
        sentence = dev_words[word_index].sentence
        tagger.tag_sentence(sentence)
        word_index += len(sentence)
    format_words(open('/tmp/dev_ner', 'wb'), dev_words, tag_scheme=TAG_SCHEME)

    word_index = 0
    while word_index < len(test_words):
        sentence = test_words[word_index].sentence
        tagger.tag_sentence(sentence)
        word_index += len(sentence)
    format_words(open('/tmp/test_ner', 'wb'), test_words, tag_scheme=TAG_SCHEME)
class RegularGrid(Grid):
    def __init__(self, grid_desc):
        assert isinstance(grid_desc, (list, tuple))
        for gd in grid_desc:
            assert isinstance(gd, (list, tuple))
            assert 3 == len(gd)

        self.dim = len(grid_desc)
        self.grid_desc = grid_desc  # List of (low,high,num) triples

        (low, hi, num_cells) = zip(*self.grid_desc)
        self.lower_bound = np.array(low, dtype=np.double)
        self.upper_bound = np.array(hi, dtype=np.double)
        self.num_cells = np.array(num_cells, dtype=np.integer)
        assert not np.any(self.num_cells <= 0)
        self.num_nodes = self.num_cells + 1

        # Cell dimensions
        self.delta = (self.upper_bound - self.lower_bound)
        self.delta /= self.num_cells.astype(np.double)

        # Initialize the indexers
        self.cell_indexer = Indexer(self.num_cells)
        self.node_indexer = Indexer(self.num_nodes)

        # Fuzz to convert [low,high) to [low,high]
        self.fuzz = 1e-15

    def points_to_cell_coords(self, points):
        """
        Figure out where points are. Returns the cell coordinate.
        """
        assert is_mat(points)
        (N, D) = points.shape
        assert D == self.dim

        # Get the OOB info
        oob = OutOfBounds()
        oob.build_from_points(self, points)
        assert oob.check()

        raw_coords = np.empty((N, D))
        for d in xrange(D):
            (low, high, num_cells) = self.grid_desc[d]
            # Transform: [low,high) |-> [0,n)
            transform = num_cells * (points[:, d] - low) / (high - low)
            # Add a little fuzz to make sure stuff on the boundary is
            # mapped correctly
            transform += self.fuzz
            raw_coords[:, d] = np.floor(transform).astype(np.integer)
            # Fuzz the top boundary to get [low,high]: anything just a
            # little greater than the last cell boundary counts as part
            # of the last cell.
            fuzz_mask = np.logical_and(high <= points[:, d],
                                       points[:, d] < high + 2 * self.fuzz)
            raw_coords[fuzz_mask, d] = num_cells - 1
        raw_coords[oob.mask, :] = np.nan

        assert is_int(raw_coords)
        coords = Coordinates(raw_coords, oob)
        assert coords.check()
        return coords

    def points_to_cell_indices(self, points):
        assert is_mat(points)
        (N, D) = points.shape
        cell_coords = self.points_to_cell_coords(points)
        assert isinstance(cell_coords, Coordinates)
        assert (N, D) == cell_coords.shape
        cell_indices = self.cell_indexer.coords_to_indices(cell_coords)
        assert is_vect(cell_indices)
        assert (N,) == cell_indices.shape
        return cell_indices

    def cell_indices_to_cell_coords(self, cell_indices):
        cell_coords = self.cell_indexer.indices_to_coords(cell_indices)
        return cell_coords

    def cell_indices_to_mid_points(self, cell_indices):
        assert is_vect(cell_indices)
        low_points = self.cell_indices_to_low_points(cell_indices)
        mid_points = low_points + row_vect(0.5 * self.delta)
        assert is_mat(mid_points)
        assert mid_points.shape[0] == cell_indices.shape[0]
        return mid_points

    def cell_indices_to_low_points(self, cell_indices):
        assert is_vect(cell_indices)
        cell_coords = self.cell_indexer.indices_to_coords(cell_indices)
        assert isinstance(cell_coords, Coordinates)
        assert cell_coords.check()
        low_points = self.cell_coords_to_low_points(cell_coords)
        assert is_mat(low_points)
        assert cell_coords.shape == low_points.shape
        return low_points

    def cell_coords_to_low_points(self, cell_coords):
        assert isinstance(cell_coords, Coordinates)
        assert self.dim == cell_coords.dim
        assert cell_coords.check()
        C = cell_coords.coords
        oob = cell_coords.oob
        assert np.all(np.isnan(C[oob.mask, :]))
        low_points = row_vect(self.lower_bound) + C * row_vect(self.delta)
        assert is_mat(low_points)
        assert np.all(np.isnan(low_points[oob.mask, :]))
        assert cell_coords.shape == low_points.shape
        return low_points

    def node_indices_to_node_points(self, node_indices):
        assert is_vect(node_indices)
        (N,) = node_indices.shape
        node_coords = self.node_indexer.indices_to_coords(node_indices)
        assert isinstance(node_coords, Coordinates)
        oob = node_coords.oob
        C = node_coords.coords
        assert np.all(np.isnan(C[oob.mask, :]))
        node_points = row_vect(self.lower_bound) + C * row_vect(self.delta)
        assert is_mat(node_points)
        assert np.all(np.isnan(node_points[oob.mask, :]))
        assert node_coords.shape == node_points.shape
        return node_points

    def cell_indices_to_vertex_indices(self, cell_indices):
        assert is_vect(cell_indices)
        cell_coords = self.cell_indexer.indices_to_coords(cell_indices)
        assert isinstance(cell_coords, Coordinates)
        vertex_indices = self.cell_coords_to_vertex_indices(cell_coords)
        assert is_mat(vertex_indices)  # (N x 2**D) matrix
        return vertex_indices

    def cell_coords_to_vertex_indices(self, cell_coords):
        assert isinstance(cell_coords, Coordinates)
        (N, D) = cell_coords.shape
        assert self.dim == D
        # The low node index in the cell has the same coords in node-land
        # as the cell in cell-land:
        #  |   |
        # -o - o-
        #  | x |
        # -x - o-
        #  |   |
        low_vertex = self.node_indexer.coords_to_indices(cell_coords)
        # Array of index offsets to reach every vertex in the cell
        shift = self.node_indexer.cell_shift()
        assert (2**D,) == shift.shape
        vertices = col_vect(low_vertex) + row_vect(shift)
        assert (N, 2**D) == vertices.shape
        # Handle out-of-bound nodes. There is a constant offset for
        # converting cell oob indices to node oob indices, plus the
        # difference between max spatial indices.
        oob = cell_coords.oob
        if oob.has_oob():
            # Figure out the right oob node
            oob_indices = cell_coords.oob.indices[oob.mask]
            offset = self.node_indexer.get_num_spatial_nodes()
            vertices[oob.mask, 0] = oob_indices + offset
            vertices[oob.mask, 1:] = np.nan
        return vertices

    def points_to_low_vertex_rel_distance(self, points, cell_coords):
        assert is_mat(points)
        assert isinstance(cell_coords, Coordinates)
        (N, D) = points.shape
        assert (N, D) == cell_coords.shape
        low_vertex = self.cell_coords_to_low_points(cell_coords)
        dist = np.empty((N, D))
        for d in xrange(D):
            dist[:, d] = (points[:, d] - low_vertex[:, d]) / self.delta[d]
        # OOB -> 0 distance from the OOB node
        dist[cell_coords.oob.mask, :] = 0.0
        assert np.all(dist >= 0.0)
        assert np.all(dist <= 1.0)
        return dist

    def are_points_oob(self, points):
        """
        Check if points are out-of-bounds.
        """
        (N, D) = points.shape
        assert D == self.dim
        L = np.any(points < row_vect(self.lower_bound), axis=1)
        U = np.any(points > row_vect(self.upper_bound) + self.fuzz, axis=1)
        assert (N,) == L.shape
        assert (N,) == U.shape
        return np.logical_or(L, U)
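A brief usage sketch for RegularGrid, assuming numpy as np plus the helper types (Grid, Indexer, OutOfBounds, Coordinates) and predicates (is_mat, is_vect, is_int) from this module; the grid description and points are illustrative:

# 2D grid: x in [0,1) split into 4 cells, y in [0,2) split into 8 cells
grid = RegularGrid([(0.0, 1.0, 4), (0.0, 2.0, 8)])
points = np.array([[0.1, 0.3],
                   [0.9, 1.9]])
# Map points to flat cell indices, then back to the cell midpoints
indices = grid.points_to_cell_indices(points)
mids = grid.cell_indices_to_mid_points(indices)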
class SearchEngine:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser
    # and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads a parquet file and passes it to the parser, then the indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # Parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # Index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')

        # self._indexer.save_index("idx_bench.pkl")
        # indexer_dic = utils.load_obj("idx_bench")
        # self._indexer.save_index("idx.pkl")  # TODO - we need to submit this
        indexer_dic = utils.load_obj("idx")  # TODO - we need to submit this

        localMethod = True
        globalMethod = False
        wordNet = False
        spellChecker = False

        if localMethod:
            indexer_dic["local"] = True
        if wordNet:
            indexer_dic["wordnet"] = True
        if spellChecker:
            indexer_dic["spellChecker"] = True

        if globalMethod:
            docs_dic, Sij_dic = compute_Wi(indexer_dic, globalMethod)
            indexer_dic["docs"] = docs_dic
            indexer_dic["global"] = Sij_dic
        else:
            docs_dic = compute_Wi(indexer_dic)
            indexer_dic["docs"] = docs_dic

        # utils.save_obj(indexer_dic, "idx_bench")
        utils.save_obj(indexer_dic, "idx")  # TODO - we need to submit this

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc.,
        and assign them to self._model, which is passed on to the searcher
        at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
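A hypothetical end-to-end call sequence for the class above; the parquet path and query string are placeholders, and the supporting modules (Parse, Indexer, Searcher, utils, compute_Wi) are assumed to be importable from this project:

engine = SearchEngine(config=None)
engine.build_index_from_parquet('data/benchmark_data_train.snappy.parquet')
engine.load_index('idx.pkl')
n_relevant, tweet_ids = engine.search('covid vaccine side effects')
print(n_relevant, tweet_ids[:10])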
def make_index(self):
    idxr = Indexer(self.source_file, self.index_file)
    idxr.make_index()
    self.make_trie()
from indexer import Indexer
import os

if __name__ == "__main__":
    indexer = Indexer()
    curpath = os.getcwd()
    for file in os.listdir(curpath):
        if file.startswith("Wiki"):
            print("Found file " + file)
            indexer.parse_files(os.path.join(curpath, file))
            print("Parsed " + file)
from urllib.parse import urljoin

SEED_URL = 'http://mysql12.f4.htw-berlin.de/crawl/'
SEED_PAGES = ('d01.html', 'd06.html', 'd08.html')
STOP_WORDS = ['d01', 'd02', 'd03', 'd04', 'd05', 'd06', 'd07', 'd08',
              'a', 'also', 'an', 'and', 'are', 'as', 'at', 'be', 'by',
              'do', 'for', 'have', 'is', 'in', 'it', 'of', 'or', 'see',
              'so', 'that', 'the', 'this', 'to', 'we']

crawler = Crawler([urljoin(SEED_URL, page) for page in SEED_PAGES])
page_rank = PageRank(crawler.webgraph_in, crawler.webgraph_out)
page_rank.build_graph()
index = Indexer(crawler.contents, STOP_WORDS)
index.build_index()
scorer = Scorer(index)

print("> SIMPLE SEARCH ENGINE (by Tammo, Tim & Flo)")
while True:
    scores = scorer.calculate_scores(input("\n> query: "))
    if not scores:
        print("your search term does not occur on any page")
        continue
    ranked_scores = [(url, score, page_rank.get_rank(url),
                      score * page_rank.get_rank(url))
                     for url, score in scores.items()]
def get_line(self, line_num):
    idxr = Indexer(self.source_file, self.index_file)
    line = None
    with idxr as i:
        line = i.read(line_num)
    return line
class Blog(object):
    def __init__(self):
        # Create the indexer
        self.Idx = Indexer()
        # Create two connection instances
        self.Post = None
        self.InvIdx = None
        self.index_fields = []

    def set_db(self, Blog_DB):
        self.Post = Blog_DB.posts
        self.InvIdx = Blog_DB.invidx

    def set_index_fields(self, fields):
        if not isinstance(fields, list):
            raise Exception("Fields must be a list")
        self.index_fields = fields
        self.Idx.set_idx_fields(fields)

    def save_post(self, post):
        logging.debug('save_post: ' + str(post))
        if self.index_fields == []:
            raise Exception("No fields to index. Please set them first!")
        if isinstance(post, list):
            raise Exception("Only accepts one post")

        if logging.root.level == logging.DEBUG:
            post_start_time = datetime.datetime.utcnow()
        # Insert the post into the posts collection
        obj_id = self.Post.insert(post)
        if logging.root.level == logging.DEBUG:
            post_end_time = datetime.datetime.utcnow()
        if obj_id == None:
            raise Exception("Error saving to mongodb")
        logging.debug('Saving post to mongo is OK')

        # strip unnecessary string
        #obj_id_strip = str(obj_id).strip('ObjectId("').rstrip('")')
        #logging.debug('strip object_id to: ' + obj_id_strip)

        if logging.root.level == logging.DEBUG:
            idx_start_time = datetime.datetime.utcnow()
        # Get the words to index
        words = self.Idx.index(post)
        # Update each word in the inverted index, one at a time
        # TODO: change to bulk update
        for word in words:
            #self.InvIdx.update({"word":word},{"$push":{"docs":obj_id_strip}},True)
            self.InvIdx.update({"word": word}, {"$push": {"docs": obj_id}}, True)
        if logging.root.level == logging.DEBUG:
            idx_end_time = datetime.datetime.utcnow()
            # Print timing info (these timestamps only exist in DEBUG mode)
            post_time = post_end_time - post_start_time
            idx_time = idx_end_time - idx_start_time
            total_time = post_time + idx_time
            logging.debug('time to save post: ' + str(post_time.total_seconds()))
            logging.debug('time to save idx: ' + str(idx_time.total_seconds()))
            logging.debug('total time: ' + str(total_time.total_seconds()))
        return obj_id

    def get_dummy_post(self, number):
        if (number < 1) or (number > 4):
            raise Exception("Choose 1..4")
        posts = {}
        posts[1] = ("Six people have been shot dead after a Russian lawyer "
                    "opened fire on his colleagues at a pharmacy company")
        posts[2] = ("Water and Venice usually go together like bees and honey. "
                    "But not when there's as much rain")
        posts[3] = ("Two men inside the utility truck have a lucky escape after "
                    "a passing freight train collides with their vehicle")
        posts[4] = ("Super storm Sandy gives New York a historic drenching.\n"
                    "Battery Park in lower Manhattan floods as record high water")
        return {"title": "Dummy post " + str(number),
                "content": posts[number],
                "time": str(datetime.datetime.utcnow())}

    def clear(self):
        self.Post.remove()
        self.InvIdx.remove()

    def search(self, input_text):
        # Get time: start of the first query
        if logging.root.level == logging.DEBUG:
            query_idx_start_time = datetime.datetime.utcnow()
        # Tokenize the query
        words_text_input = self.Idx.tokenize(input_text)
        # Build a query to get doc_ids
        list_words_text_input = []
        for word_text_input in words_text_input:
            cond_words_text_input = {"word": word_text_input}
            list_words_text_input.append(cond_words_text_input)
        final_words_text_input = {"$or": list_words_text_input}
        # Get doc_ids from the inverted index
        doc_ids = [queryIdx.values()[0] for queryIdx in
                   self.InvIdx.find(final_words_text_input, {"docs": 1})]
        # Remove duplicate doc_ids
        doc_ids = set([doc_id[0] for doc_id in doc_ids])
        # Get time: end of the first query & start of the second
        if logging.root.level == logging.DEBUG:
            query_idx_end_time = datetime.datetime.utcnow()
            query_col_start_time = query_idx_end_time
        # Build a query to get documents by doc_ids
        list_doc = []
        for doc_id in doc_ids:
            cond_doc = {"_id": ObjectId(doc_id)}
            list_doc.append(cond_doc)
        final_doc = {"$or": list_doc}
        # Get the posts from the posts collection
        docs = self.Post.find(final_doc)
        if logging.root.level == logging.DEBUG:
            query_col_end_time = datetime.datetime.utcnow()
            # Print timing info
            query_idx_time = query_idx_end_time - query_idx_start_time
            query_col_time = query_col_end_time - query_col_start_time
            total_time = query_idx_time + query_col_time
            logging.debug('time to query invidx: ' + str(query_idx_time.total_seconds()))
            logging.debug('time to query posts: ' + str(query_col_time.total_seconds()))
            logging.debug('total query time: ' + str(total_time.total_seconds()))
        return docs
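A short usage sketch, assuming a pymongo 2.x-era connection whose database exposes posts and invidx collections (the names used in set_db above); the connection details are placeholders:

import pymongo

db = pymongo.MongoClient('localhost', 27017).blogdb  # placeholder connection
blog = Blog()
blog.set_db(db)
blog.set_index_fields(['title', 'content'])
blog.save_post(blog.get_dummy_post(1))
# Query the inverted index, then fetch the matching posts
for doc in blog.search('Russian lawyer'):
    print doc['title']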
def main() -> None:
    Indexer(DOCS_DIR, DOC_SIZE)
from indexer import Indexer
import sys

source_file = sys.argv[1]
index_file = sys.argv[2]
idxr = Indexer(source_file, index_file)
idxr.make_index()
sys.stdout.write("Done !!\n")
def __init__(self, config=None):
    self._config = config
    self._parser = Parse()
    self._indexer = Indexer(config)
    self._model = None
>>> expand_statement("m10m")
"margin: 10em"
>>> expand_statement(" m10m")
" margin: 10em"
>>> expand_property("pad")
"padding:"
"""

import re

from definitions import definitions
from indexer import Indexer

# Indexing
index = Indexer()
index.index(definitions)

# Also see http://www.w3.org/TR/css3-values/
line_expr = re.compile(r'^(\s*)(.*?)$')
rule_expr = re.compile(r'^((?:[a-z]+-)*[a-z]+): *([^\s].*?);?$')
value_expr = re.compile(r'^([^\.\d-]*)(-?\d*\.?\d+)(x|p[tcx]?|e[mx]?|s|m[ms]?|rem|ch|v[wh]|vmin|max|%|)$')
semicolon_expr = re.compile(r';\s*$')
selectorlike_expr = re.compile(r'.*(link|visited|before|placeholder|root|after|focus|hover|active|checked|selected).*')
ends_in_brace_expr = re.compile(r'.*\{\s*$')


def expand_statement(line, usecolon=True):
    """Expands a statement line. Executed when pressing <Enter>.

    "db"  => "display: block"
    "m3m" => "margin: 3em"
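A quick illustration of what value_expr captures: the first group is the property shorthand, the second the number, the third the unit shorthand. The sample strings below are illustrative:

for sample in ('m10m', 'pad5x', 'w100%'):
    match = value_expr.match(sample)
    if match:
        print("%r -> %r" % (sample, match.groups()))
# 'm10m'  -> ('m', '10', 'm')    # shorthand unit 'm' later expands to 'em'
# 'pad5x' -> ('pad', '5', 'x')
# 'w100%' -> ('w', '100', '%')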
class IrregularGrid(Grid):
    """
    Rectilinear grid from an irregular, but sorted, list of node locations
    """
    def __init__(self, node_lists):
        self.dim = len(node_lists)
        self.node_lists = np.array(node_lists)  # List of np.ndarray cutpoint locations
        for nl in node_lists:
            assert nl.ndim == 1   # 1D array
            assert nl.size >= 2   # At least two nodes
            assert is_sorted(nl)

        # Number of cutpoints along each dimension
        desc = [(nl[0], nl[-1], nl.size) for nl in node_lists]
        (low, hi, num) = zip(*desc)
        self.lower_bound = np.array(low)
        self.upper_bound = np.array(hi)
        self.num_nodes = np.array(num)
        self.num_cells = self.num_nodes - 1

        # Initialize the indexer
        self.indexer = Indexer(self.num_nodes)

        # Fuzz to convert [low,high) to [low,high]
        self.fuzz = 1e-12

    def points_to_cell_coords(self, points):
        (N, D) = points.shape
        assert D == self.dim

        Coords = np.empty((N, D))
        for d in xrange(D):
            # Find the correct position in the dth node list; 'right' is
            # important if points are exactly on a node.
            coord = np.searchsorted(self.node_lists[d],
                                    points[:, d],
                                    side='right') - 1
            assert (N,) == coord.shape
            Coords[:, d] = coord

            # Include the upper boundary
            ub = self.upper_bound[d]
            hi_cell = self.num_cells[d] - 1
            fuzz_mask = np.logical_and(points[:, d] >= ub,
                                       points[:, d] < ub + self.fuzz)
            Coords[fuzz_mask, d] = hi_cell

            # The indexer will take care of mapping to the correct OOB node
            #lb = self.lower_bound[d]
            #oob_mask = np.logical_or(points[:,d] < lb,
            #                         points[:,d] >= ub+self.fuzz)
            #Coords[oob_mask,d] = np.nan
        return Coords

    def points_to_indices(self, points):
        coords = self.points_to_cell_coords(points)
        return self.indexer.coords_to_indices(coords)

    def indices_to_lowest_points(self, indices):
        assert 1 == indices.ndim
        coords = self.indexer.indices_to_coords(indices)
        return self.coords_to_lowest_points(coords)

    def coords_to_lowest_points(self, coords):
        assert 2 == coords.ndim
        (N, D) = coords.shape
        assert self.dim == D

        points = np.empty((N, D))
        for d in xrange(D):
            points[:, d] = self.node_lists[d, coords[:, d]]
        return points
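A small sketch of the irregular grid above, assuming numpy as np plus the Grid base class, Indexer, and is_sorted helper from this module; the cutpoints and points are illustrative:

x_nodes = np.array([0.0, 0.5, 2.0, 4.0])   # irregular but sorted cutpoints
y_nodes = np.array([0.0, 1.0, 1.1, 3.0])
grid = IrregularGrid([x_nodes, y_nodes])
points = np.array([[0.6, 1.05],
                   [3.9, 2.99]])
# Per-dimension cell coordinates via searchsorted: [[1., 1.], [2., 2.]]
print(grid.points_to_cell_coords(points))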
class Crawler(object):
    def __init__(self):
        self.visited_url = set()
        self.root_url = None
        self.indexer = Indexer()

    def pass_robot_txt(self, url):
        robot = robotparser.RobotFileParser()
        robot.set_url(self.root_url)
        robot.read()
        return robot.can_fetch('*', url)

    def define_root_url(self, url):
        self.root_url = url

    def add_included_suburls(self, soup):
        urls = set()
        refs = soup.findAll('a')
        for ref in refs:
            try:
                href = ref['href']
            except Exception:
                print("Doesn't contain a suburl")
                continue
            if len(href) < 2:
                continue
            if '//' in href:
                continue
            if href[0] != '/':
                continue
            if self.root_url in href:
                urls.add(href)
            urls.add(self.root_url + href)
        return urls

    def get_pair_word_and_count(self, soup):
        def visible(element):
            if element.parent.name in ['head', 'script', 'style', '[document]']:
                return False
            if re.match('<!--.*-->', str(element)):  # skip HTML comments
                return False
            if element == '\n':
                return False
            return True

        data = soup.findAll(text=True)
        visible_text = filter(visible, data)
        words = list()
        for text in visible_text:
            result = re.findall(r'[0-9a-z]+', text.lower())
            for res in result:
                words.append(res)
        self.indexer.add_words(set(words))
        return Counter(words)

    def visit(self, url, width, depth):
        if depth < 0:
            return
        if not self.pass_robot_txt(url):
            raise Exception("robots.txt disallows this URL")
        current_url = url
        self.indexer.add_url(current_url)
        depth = depth - 1
        try:
            html = urllib2.urlopen(url).read()
        except Exception:
            print("Can't open this *** url")
            return
        soup = BeautifulSoup(html)
        urls = self.add_included_suburls(soup)
        for url in urls:
            if url in self.visited_url:
                continue
            if width == 0:
                break
            self.visited_url.add(url)
            width = width - 1
            self.visit(url, width, depth)
        words = self.get_pair_word_and_count(soup).iteritems()
        self.indexer.create_index(words, current_url)

    def run(self, url, width, depth):
        self.define_root_url(url)
        self.visit(url, width, depth)
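An illustrative launch of the crawler above, assuming Indexer, BeautifulSoup, urllib2, robotparser, re, and Counter are imported in this module; the seed URL and limits are placeholders:

crawler = Crawler()
# Follow at most 10 links per page and recurse 2 levels deep from the seed
crawler.run('http://example.com', width=10, depth=2)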
def __init__(self):
    self.visited_url = set()
    self.root_url = None
    self.indexer = Indexer()