Example #1
def main(flist, dbname='ftp_files.db', xname='xapian.db', verbose=False):
    '''
    Main method: dispatches tasks to catalogue and index remote FTP servers.
    '''
    db = Database(dbname)
    indexer = Indexer(xname, writeable=True)
    
    # Read list of remote FTP servers
    servers = []
    with open(flist) as f:
        servers = f.read().splitlines()
    
    for server in servers:
        if verbose: print "Scanning: %s" % server
        
        # Record all files on a remote server
        if not enumerate_files(server, db):
            print "Could not enumerate files on %s" % server
        
        # Download text and add to corpus
        if not index_content(server, indexer, db):
            print "Could not index %s" % server
    
    if verbose: print "\nCataloguing and indexing complete."
    
    # cleanup
    indexer.close()
    db.close()
Example #2
def __main__(argv):
    #%%
    logger = logging.getLogger(__name__)
    logger.info("VECTOR MODEL INFORMATION RETRIEVAL SYSTEM START")    
    
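    # Pipeline: build the inverted index, vectorize it with TF-IDF, process the
    # queries, run the search engine over them, then evaluate the results.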
    gli = InvertedIndexGenerator(GLI_CONFIG_FILE)
    gli.run()
    gli.write_output()
    
    index = Indexer(INDEX_CONFIG_FILE, TfidfVectorizer)
    index.run()
    index.write_output()
    
    pc = QueryProcessor(PC_CONFIG_FILE)
    pc.run()
    pc.write_output()
    
    buscador = SearchEngine(BUSCA_CONFIG_FILE, TfidfVectorizer)
    buscador.run()
    buscador.write_output()
    #%%
    avaliador = Evaluator(AVAL_CONFIG_FILE)
    avaliador.run()
    avaliador.write_output()
    
    logger.info("VECTOR MODEL INFORMATION RETRIEVAL SYSTEM DONE")     
Example #3
    def __init__(self,grid_desc):
        assert isinstance(grid_desc,(list,tuple))
        for gd in grid_desc:
            assert isinstance(gd,(list,tuple))
            assert 3 == len(gd)        
        self.dim = len(grid_desc)
        self.grid_desc = grid_desc # List of (low,high,num) triples

        (low,hi,num_cells) = zip(*self.grid_desc)
        self.lower_bound = np.array(low,dtype=np.double)
        self.upper_bound = np.array(hi,dtype=np.double)
        self.num_cells = np.array(num_cells,dtype=np.integer)
        assert not np.any(self.num_cells <= 0)
        self.num_nodes = self.num_cells + 1

        # Cell dimensions
        self.delta = (self.upper_bound - self.lower_bound)
        self.delta /= self.num_cells.astype(np.double)

        # Initialize the indexer
        self.cell_indexer = Indexer(self.num_cells)
        self.node_indexer = Indexer(self.num_nodes)

        # Fuzz to convert [low,high) to [low,high]
        self.fuzz = 1e-15
Example #4
 def flush(self):
   if not self._mode: raise TarCMS.TarCMSError('not open: %r' % self)
   self._corpus.flush()
   self._artdb.flush()
   indexer = Indexer(self._indexdb, self._corpus, verbose=self.verbose)
   for tid in self._loctoindex:
     indexer.index_loc(tid)
   indexer.finish()
   self._loctoindex.clear()
   return
Example #5
 def recover(self):
   if self._mode: raise TarCMS.TarCMSError('already open: %r' % self)
   self._corpus.recover_catalog()
   self.recover_artdb()
   self._indexdb.reset()
   indexer = Indexer(self._indexdb, self._corpus, verbose=self.verbose)
   for tid in self._corpus.get_all_locs():
     indexer.index_loc(tid)
   indexer.finish()
   return
Example #6
def home_page(request):

    context = {}
    search_query = request.GET.get("input")

    if search_query is not None:
        indexer = Indexer()
        results = indexer.search_query_result(search_query)
        print(results)
        context['urls'] = results

    return render(request,'home.html', context)
Example #7
def main():
    print "Usage: python test_indexer.py <source path> [destination]"
    source = sys.argv[1]
    destination = None
    if len(sys.argv) > 2:
        destination = sys.argv[2]

    # initialize an indexer object
    indexer = Indexer(source, destination)
    # preprocess the yelp dataset
    indexer.preprocess()
    # index the preprocessed data
    indexer.index()
Example #8
def main(search_terms):
    dbname = 'ftp_files.db'
    db = Database(dbname)
    
    xname = 'xapian.db'
    corpus = Indexer(xname)
    
    result = corpus.search(str(search_terms))
    print_results(result[0], result[1], result[2], db)
    
    # clean up
    corpus.close()
    db.close()
Example #9
def parse_html(url, bs):
    print 'Start parse html from url: ' + str(url)
    body = bs.find('body')
    if body is None:
        return
    raw_text = body.get_text()
    words = get_words_from_raw_text(raw_text)
    dict_words = get_dict_words(words[:100])

    # print dict_words
    print 'Start Indexing url: ' + str(url)
    indexer = Indexer(url=url, words=dict_words)
    indexer.save()
Example #10
def main(flist, plist="prefix.conf", dbname="ftp_files.db", xname="xapian.db", verbose=False):
    """
    Main method: dispatches tasks to catalogue and index remote FTP servers.
    """
    db = Database(dbname)
    indexer = Indexer(xname, writeable=True)

    # Read list of prefixes
    prefixes = []
    with open(plist) as f:
        prefixes = f.read().splitlines()

    # Read list of remote FTP servers
    servers = []
    with open(flist) as f:
        servers = f.read().splitlines()

    # Compile list of all servers
    for server in servers[:]:
        idx = servers.index(server)
        for prefix in prefixes:
            servers.insert(idx, prefix + "." + server)

    for server in servers:
        if verbose:
            print "Scanning: %s" % server

        # Determine if server is a valid FTP site
        if not is_open_ftp_server(server):
            continue

        if verbose:
            print "\tServer is valid, connecting..."

        # Record all files on a remote server
        if not enumerate_files(server, db, verbose=verbose):
            print "\tCould not enumerate files on %s" % server
            continue

        # Download text and add to corpus
        if not index_content(server, indexer, db, verbose=verbose):
            print "\tCould not index %s" % server

    if verbose:
        print "\nCataloguing and indexing complete."

    # cleanup
    indexer.close()
    db.close()
Example #11
def index_esri_server(server_id):
    app.logger.info('Indexing ESRI server %s', server_id)
    server = EsriServer.query.get(server_id)

    if not server:
        app.logger.error('ESRI server %s was not found', server_id)
        return

    server.status = 'importing'
    db.session.add(server)
    db.session.commit()

    resulting_status = 'errored'
    try:
        indexer = Indexer(app.logger)
        services = indexer.get_services(server.url)
        for service in services:
            service_details = indexer.get_service_details(service.get('url'))

            db_service = Service(
                server=server,
                name=service.get('name'),
                service_type=service.get('type'),
                service_data=service_details,
            )
            db.session.add(db_service)

            layers = service_details.get('layers', [])
            for layer in layers:
                db_layer = Layer(
                    service=db_service,
                    name=layer.get('name'),
                    layer_data=layer,
                )
                db.session.add(db_layer)
        resulting_status = 'imported'
    except requests.exceptions.RequestException:
        app.logger.exception('Problem indexing ESRI server %s', server_id)
    except ValueError:
        app.logger.exception('Problem indexing ESRI server %s', server_id)

    server.status = resulting_status
    server.job_id = None
    db.session.add(server)
    db.session.commit()
Example #12
def main():
    global indexer, uploader, sender, receiver, downloader
    setup_signals()
    logging.info("Asink client started at %s" %
                 (time.strftime("%a, %d %b %Y %X GMT", time.gmtime())))

    #create all threads which will be used to process events
    indexer = Indexer()
    uploader = Uploader()
    sender = Sender()
    receiver = Receiver()
    downloader = Downloader()

    #create and set up queues which are used to pass events between threads
    uploader_queue = Queue()
    indexer.uploader_queue = uploader_queue
    uploader.queue = uploader_queue
    #set on watcher when initialized

    sender_queue = Queue()
    uploader.sender_queue = sender_queue
    sender.queue = sender_queue

    downloader_queue = Queue()
    receiver.downloader_queue = downloader_queue
    downloader.queue = downloader_queue

    #setup storage provider
    storage = setup_storage()
    uploader.storage = storage.clone()
    downloader.storage = storage

    #start all threads
    watcher.start_watching(uploader_queue)
    indexer.start()
    uploader.start()
    sender.start()
    receiver.start()
    downloader.start()

    #sleep until signaled, which will call sig_handler
    while True:
        time.sleep(86400) #= 24 hours just for fun
Example #13
	def __init__(self):
		
		# create indexer 
		self.Idx = Indexer()

		# create two connection instances
		self.Post = None
		self.InvIdx = None

		self.index_fields = []
Example #14
def main():
    """
    Main function
    """
    # Download data for NLTK if not already done
    #nltk.download('all')

    # Read 
    imdb = Indexer()
    imdb_file = 'data/data.json'
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
    logging.info('Reading file %s' % imdb_file)
    imdb.read_file(imdb_file)
    logging.info('File %s read' % imdb_file)
    (vocab_size, user_list, movie_list, \
    rating_matrix, review_matrix, review_map) = imdb.get_mappings()

    # Get number of users and movies
    Users = len(user_list)
    Movies = len(movie_list)
    logging.info('No. of users U = %d' % Users)
    logging.info('No. of movies M = %d' % Movies)

    # Run Gibbs EM
    for it in xrange(1,MAX_ITER+1):
        logging.info('Running iteration %d of Gibbs EM' % it)
        logging.info('Running E-Step - Gibbs Sampling')
        gibbs_sampler = GibbsSampler(5,A,2)
        gibbs_sampler.run(rating_matrix)
        logging.info('Running M-Step - Gradient Descent')
        for i in xrange(1,MAX_OPT_ITER+1):
            optimizer()

    # Output Predicted Ratings
    for u in range(Users):
        for m in range(Movies):
            pred_rate = predicted_rating(u, m)
            print "Predicted Rating of user " + str(u) + " and movie " + str(m) + ": " + str(pred_rate)
Example #15
    def setUp(self):
        self.__reset_listdir_mapping()
        self.__reset_isdir_mapping()

        self.mock_listdir_patcher = patch('os.listdir')
        self.mock_listdir = self.mock_listdir_patcher.start()
        self.mock_listdir.side_effect = mock_listdir

        self.mock_isdir_patcher = patch('os.path.isdir')
        self.mock_isdir = self.mock_isdir_patcher.start()
        self.mock_isdir.side_effect = mock_isdir

        self.mock_copy_patcher = patch('shutil.copy')
        self.mock_copy = self.mock_copy_patcher.start()

        self.mock_open_patcher = patch('indexer.open')
        self.mock_open = self.mock_open_patcher.start()

        # http://stackoverflow.com/questions/24779893/customizing-unittest-mock-mock-open-for-iteration
        self.mock_open.return_value = mock_open(read_data='fake-file-contents').return_value

        self.mock_remove_patcher = patch('os.remove')
        self.mock_remove = self.mock_remove_patcher.start()

        self.mock_config = Mock(spec=Config)
        self.mock_config.haystack_root.return_value = '/root'
        self.mock_config.staging_root.return_value = '/root/staging'
        self.mock_config.thumbnail_path_pattern.return_value = '/root/thumbnails/%Y/%M/%D'
        self.mock_config.picture_path_pattern.return_value = '/root/pictures/%Y/%M/%D'
        self.mock_config.video_path_pattern.return_value = '/root/videos/%Y/%M/%D'
        self.mock_config.staging_directory.side_effect = mock_staging_dir

        self.mock_metadata_helper = Mock(spec=MetadataHelper)
        self.mock_metadata_helper.get_date_taken.return_value = 1449176000

        self.mock_index = Mock(spec=Index)
        self.mock_index.is_duplicate.return_value = False

        self.mock_thumbnail_generator = Mock(spec=ThumbnailGenerator)

        self.mock_util = Mock(spec=Util)

        self.mock_video_converter = MagicMock(spec=VideoConverter)

        self.mock_preprocessor = MagicMock(spec=Preprocessor)

        self.test_model = Indexer(self.mock_config, self.mock_index, self.mock_metadata_helper,
                                  self.mock_thumbnail_generator, self.mock_util, self.mock_video_converter,
                                  self.mock_preprocessor)
Example #16
  def __init__(self, input_dir):
    '''
    '''
    self._input_dir = input_dir
    self._indexer = Indexer()

    self._no_workers = mp.cpu_count() - 1 # leave one main process out
    self._active_workers = mp.Queue(self._no_workers)

    self._loading_queue = []  # mp.Queue()
    self._viewing_queue = []

    self._sections = None
    self._views = {}

    self._zoomlevels = None

    self._client_tile_size = 512
Example #17
    def run(self):
        """ Starts the main loop"""
        self._load_configuration()
        self._init_database()
        self.pb = PhoneBook(self.dbconn)
        self.indexer = Indexer()

        logging.info("Starting IRCThread thread")
        self.irc_thread = IRCThread(self,
                                    self.config['server'],
                                    self.config['server_port'],
                                    self.config['nickname'],
                                    self.config['channel'])
        self.irc_thread.start()

        logging.info("Starting webserver")
        http_thread = HTTPThread(self, ('0.0.0.0', 8090))
        http_thread.start()

        logging.info("Starting main loop")
        self._main_loop()
Example #18
	def start(self, args):
		logger = Logger()
		backend = OutputElasticSearch(args.es_server, args.index)
		parsers = ParserPlugins()
		
		indexer = Indexer(logger, backend, parsers)
		indexer.ignore_extensions(self.ignore_extensions)
		
		if args.check_removed:
			indexer.check_removed()

		if args.index_dir:
			indexer.directory(args.index_dir)

		if args.truncate:
			backend.truncate()

		if args.webserver:
			import webserver
			webserver.start(backend)
Example #19
    def __init__(self,node_lists):
        self.dim = len(node_lists)
        self.node_lists = np.array(node_lists)
        # List of np.ndarray cutpoint locations

        for nl in node_lists:
            assert nl.ndim == 1 # 1D array
            assert nl.size >= 2 # At least two nodes
            assert is_sorted(nl)
        
        # Number of cutpoints along each dimension
        desc = [(nl[0],nl[-1],nl.size) for nl in node_lists]
        (low,hi,num) = zip(*desc)
        self.lower_bound = np.array(low)
        self.upper_bound = np.array(hi)
        self.num_nodes = np.array(num)
        
        self.num_cells = self.num_nodes - 1

        # Initialize the indexer
        self.indexer = Indexer(self.num_nodes)

        # Fuzz to convert [low,high) to [low,high]
        self.fuzz = 1e-12
Example #20
word_list = []
tag_list = []
for sentence in train_sentences:
    for (word_text, ner_tag) in sentence:
        word_list.append(word_text)
        tag_list.append(ner_tag)

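# Load pre-trained GloVe embeddings: each line holds a word followed by the
# values of its 300-dimensional vector.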
embedding_by_word = {}
for line in open('/Users/konix/Documents/pos_data/glove.6B/glove.6B.300d.txt',
                 'rb').readlines():
    word, embedding_str = line.split(' ', 1)
    embedding = np.asarray(
        [float(value_str) for value_str in embedding_str.split()])
    embedding_by_word[word] = embedding

word_counter = Counter(word_list)
word_indexer = Indexer()
word_indexer.index_object_list([
    word_text for (word_text, word_count) in word_counter.iteritems()
    if word_count >= 5
])
word_indexer.index_object_list(embedding_by_word.keys())
unk_word_index = word_indexer.index_object('_UNK_')

tag_counter = Counter(tag_list)
tag_indexer = Indexer()
tag_indexer.index_object_list(tag_counter.keys())
tag_indexer.index_object('_START_')

model = Model()
sgd = AdamTrainer(model)
Example #21
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        reader = ReadFile('')
        documents_list = reader.read_fn(fn)
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        self._indexer.after_indexing()
        self._indexer.save_index("inverted_idx")
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and 
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        self.load_index("inverted_idx.pkl")
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
Example #22
from indexer import Indexer
import sys

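# Positional arguments: the source file, its prebuilt index file, and the
# line number to print.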
source_file = sys.argv[1]
index_file = sys.argv[2]
line_number = int(sys.argv[3])

idxr = Indexer(source_file, index_file)

with idxr as i:
    print i.read(line_number)
Example #23
class SearchEngine:

    num_of_tweets = 0

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    def get_num_of_tweets(self):
        return self.num_of_tweets

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.

    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        self.num_of_tweets = len(documents_list)

        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            parsed_document.num_of_tweets = self.num_of_tweets
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')
        # TODO: check indexer saving
        utils.save_obj(self._indexer.inverted_idx, "inverted_idx")

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        inverted_idx = self._indexer.load_index(fn)
        return inverted_idx

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

        # DO NOT MODIFY THIS SIGNATURE
        # You can change the internal implementation as you see fit.

    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """

        query_as_list = self._parser.parse_sentence(query, 0)
        original_query_list = query.split(" ")
        stop_words = stopwords.words('english')
        original_query_list = [
            w for w in original_query_list if w not in stop_words
        ]
        # find long terms and upper case words
        counter = 0
        while counter < len(original_query_list):
            len_term = 1
            word = original_query_list[counter]
            if word.isupper():  # NBA
                if word.find("\n") != -1:
                    word = word[:-1]
                    if word.find(".") != -1:
                        word = word[:-1]
                query_as_list.append(word)
            elif len(word) > 1 and re.search(
                    '[a-zA-Z]',
                    word) and word[0].isupper():  # upper first char
                term = word
                if original_query_list.index(word) + 1 < len(
                        original_query_list):
                    index = original_query_list.index(word) + 1
                    while index < len(original_query_list):  # find all term
                        if len(original_query_list[index]) > 1 and re.search('[a-zA-Z]',
                                                                             original_query_list[index]) and \
                                original_query_list[index][0].isupper():
                            new_word2 = original_query_list[index][
                                0] + original_query_list[index][1:].lower(
                                )  # Donald Trump
                            term += " " + new_word2
                            index += 1
                            len_term += 1
                        else:
                            break
                    if len_term > 1:
                        query_as_list.append(term)
            counter += len_term

        spell_checker = SpellChecker_ranker.correct_query(query_as_list)
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(spell_checker)  # TODO: add K results
Example #24
>>> expand_statement("m10m")
"margin: 10em"

>>> expand_statement("    m10m")
"    margin: 10em"

>>> expand_property("pad")
"padding:"
"""
import re
from definitions import definitions
from indexer import Indexer

# Indexing
index = Indexer()
index.index(definitions)

# Also see http://www.w3.org/TR/css3-values/
line_expr = re.compile(r'^(\s*)(.*?)$')
rule_expr = re.compile(r'^((?:[a-z]+-)*[a-z]+): *([^\s].*?);?$')
value_expr = re.compile(
    r'^([^\.\d-]*)(-?\d*\.?\d+)(x|p[tcx]?|e[mx]?|s|m[ms]?|rem|ch|v[wh]|vmin|max|%|)$'
)
semicolon_expr = re.compile(r';\s*$')
selectorlike_expr = re.compile(
    r'.*(link|visited|before|placeholder|root|after|focus|hover|active|checked|selected).*'
)
ends_in_brace_expr = re.compile(r'.*\{\s*$')

Example #25
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 13 16:04:56 2017

@author: Ashwin
MLSALT 5: Question 5
"""

from indexer import Indexer
from datetime import datetime

startTime = datetime.now()

indexer = Indexer('decode.ctm')
test = indexer.makeGraphDict('grapheme.map')
indexer.queries('queries.xml')
indexer.hitsHeader('decode-grph.xml')
indexer.hitsFile('decode-grph.xml', 'TRUE')

#queryMorpy = indexer.queryMorphDict(0,'morph.kwslist.dct')
#indexer.initWithMorph('decode.ctm','morph.dct')
#indexer.morphQueryToHits('queries.xml', 'morph.kwslist.dct','decode-word-morph.xml', 'TRUE')

print(datetime.now() - startTime)
Example #26
import argparse
import document_pb2
import struct
import gzip
import sys

from indexer import Indexer
from compression import VARBYTE, SIMPLE9
from docreader import DocumentStreamReader


def parse_command_line():
    parser = argparse.ArgumentParser(description='compressed documents reader')
    parser.add_argument('args',
                        nargs='+',
                        help='Input files (.gz or plain) to process')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_command_line().args
    compression = args.pop(0)
    reader = DocumentStreamReader(args)
    if compression == "simple9":
        compression = SIMPLE9
    else:
        compression = VARBYTE
    indexer = Indexer(compression)
    for doc_id, doc in enumerate(reader):
        indexer.handle_doc(doc, doc_id + 1)
    indexer.save_index()
Example #27
 def __init__(self):
     self.nodes = Indexer()
     self.links = defaultdict(list)
     self.redirects = {}
Example #28
File: idx.py Project: vsraptor/pse
#!/usr/bin/env python
import os, sys
basedir = os.path.abspath(os.path.dirname(__file__))
libdir = os.path.abspath(os.path.join(basedir, '../lib'));
sys.path.append(libdir)

from indexer import Indexer

if __name__ == '__main__' :
	ix = Indexer()
	ix.process()
Example #29
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config

        if self._config:
            if not hasattr(self._config, 'toStem'):
                self._config.toStem = False
            if not hasattr(self._config, 'toLemm'):
                self._config.toLemm = False

        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        self.corpus_size = 0
        self.load_precomputed_model()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        self._indexer.save_index(
            self._config.get_output_path())  # Save the inverted_index to disk
        self.corpus_size = self._indexer.get_docs_count()
        self.calculate_doc_weight()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        self._model = SpellCheck

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)

    def calculate_doc_weight(self):
        """
        The method calculates the TF-IDF for each document.
        :return:
        """
        for word in self._indexer.inverted_idx:
            for doc_id in self._indexer.inverted_idx[word]['posting_list']:
                normalized_term_tf = self._indexer.inverted_idx[word][
                    "posting_list"][doc_id][0]
                term_df = self._indexer.inverted_idx[word]['df']
                term_idf = math.log10(self.corpus_size / term_df)
                # calculate doc's total weight
                term_weight = normalized_term_tf * term_idf
                self._indexer.inverted_idx[word]["posting_list"][
                    doc_id].append(term_weight)
                term_weight_squared = math.pow(term_weight, 2)
                self._indexer.docs_index[doc_id][0] += term_weight_squared
                self._indexer.docs_index[doc_id][0] = round(
                    self._indexer.docs_index[doc_id][0], 3)
Example #30
            for docID in doclists[0]:
                flag = True
                for j in range(1, len(doclists)):
                    if docID not in doclists[j]:
                        flag = False
                if flag:
                    res.append(docID)
        return res

    def tokenize_query(self, q):
        if self.type == 'simple':
            return self.tokenize_simple_query(q)
        else:
            return ['']

    def tokenize_simple_query(self, q):
        return [i.replace(' ', '') for i in q.split('&')]


if __name__ == '__main__':
    indx = Indexer()
    indx.read()
    search = Searcher(indx)
    while True:
        words = sys.stdin.readline()
        if not words:
            break
        if words[-1] == '\n': words = words[:-1]
        print words
        res = search.search(words.decode('utf8').lower())
        print len(res)
        for i in res:
            print search.indx.urls[i]
Example #31
def main():
    Session()
    indexer = Indexer()
    indexer.run()
Example #32
import json
from typing import Dict, List, Union

from flask import Flask, Response
from flask_cors import CORS

from indexer import Indexer  # assumed import path for this project's Indexer
from settings import config, versions
from version import read_readmes

app = Flask(__name__, static_url_path='', static_folder='public')
app.add_url_rule('/', 'root', lambda: app.send_static_file('index.html'))
app.add_url_rule(
    '/lees-impact-vragenlijst-nl-2019/',
    'reading-impact-questionnaire-nl-2019',
    lambda: app.send_static_file('questionnaire-nl-2019/index.html'))
app.add_url_rule(
    '/reading-impact-questionnaire-en-2020/',
    'reading-impact-questionnaire-en-2020',
    lambda: app.send_static_file('questionnaire-en-2020/index.html'))

cors = CORS(app)
es_indexer = Indexer(config)
readme = read_readmes()


def read_boilerplate(version: str) -> Dict[str, str]:
    with open(versions[version]['boilerplate_file'], 'rt') as fh:
        return json.load(fh)


def read_questions(version: str) -> Dict[str, str]:
    with open(versions[version]['questions_file'], 'rt') as fh:
        return json.load(fh)


def make_response(response_data: Union[List[Dict[str, any]], Dict[str, any]]):
    return Response(json.dumps(response_data),
Example #33
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        self.map_list = []
        self.prec5_list = []
        self.prec10_list = []
        self.prec50_list = []
        self.prec_total_list = []
        self.recall_list = []

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        print("\nNow Starting search engine 2")

        total_time = datetime.now()
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        # print("len of inverted: ", len(self._indexer.inverted_idx))
        # print("len of posting: ", len(self._indexer.postingDict))
        # print("len of dataSet: ", len(self._indexer.benchDataSet))
        # end_time = datetime.now()
        # print('\n ------ Time To Retrieve: {}'.format(end_time - total_time), " ------\n")
        #
        # print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)

    def run_engine_two(self, fn):

        self.build_index_from_parquet(fn)
        queries_path = "data\\queries_train.tsv"

        all_queries = SearchEngine.query_reader(
            queries_path)["information_need"]

        for i, q in enumerate(all_queries):
            print(q)
            k, docs = self.search(q)
            # print(docs[:10])
            self.check_engine_quality(i + 1, docs[:300])
            print()

        print("Avg map is :", (sum(self.map_list) / len(self.map_list)))

    @staticmethod
    def query_reader(queries_path):

        data = pd.read_csv(queries_path, sep="\t")
        return data

    def get_parser(self):
        return self._parser

    def check_engine_quality(self, query_num, list_of_docs):
        """
        :param query_num:
        :param list_of_docs:
        :return: no return. prints metrics of the query. precision, recall, map.
        """

        benchmark_path = "data\\benchmark_lbls_train.csv"
        df = pd.read_csv(benchmark_path)

        df_prec = df[df['query'] == query_num]
        df_prec = df_prec[df_prec['tweet'].isin(list_of_docs)]
        dict_for_data = df_prec.set_index('tweet')['y_true'].to_dict()

        rmv_lst = []

        ranking = []
        # Add to list for rank
        for doc in list_of_docs:
            try:
                ranking.append(dict_for_data[int(doc)])
            except:
                rmv_lst.append(doc)
        for d in rmv_lst:
            list_of_docs.remove(d)

        data_df = pd.DataFrame({
            'query': query_num,
            'tweet': list_of_docs,
            'y_true': ranking
        })

        df_rec = df[df['query'] == query_num]
        recall_total = len(df_rec[df_rec['y_true'] == 1.0])

        # print("total Relevant doc found with tag 1 :" , len (data_df[data_df['y_true'] == 1.0]))
        # print("total NON relevant doc found with tag 0 :" , len (data_df[data_df['y_true'] == 0]))
        # print("found total of", len(df_prec), "tagged docs")
        # Calculate metrics and print
        prec5 = metrics.precision_at_n(data_df, query_num, 5)
        prec10 = metrics.precision_at_n(data_df, query_num, 10)
        prec50 = metrics.precision_at_n(data_df, query_num, 50)
        prec_total = metrics.precision(data_df, True, query_number=query_num)
        map_of_query = metrics.map(data_df)
        recall_val = metrics.recall_single(data_df, recall_total, query_num)
        self.map_list.append(map_of_query)
        self.prec5_list.append(prec5)
        self.prec10_list.append(prec10)
        self.prec50_list.append(prec50)
        self.prec_total_list.append(prec_total)
        self.recall_list.append(recall_val)

        print()
        print("precision at 5 of query", query_num, "is :", prec5)
        print("precision at 10 of query", query_num, "is :", prec10)
        print("precision at 50 of query", query_num, "is :", prec50)
        print("precision of query", query_num, "is :", prec_total)
        print("recall of query", query_num, "is :", recall_val)
        print("map of query", query_num, "is :", map_of_query)
Example #34
from indexer import Indexer
from scraper import scrape
import datetime as dt
from database_utils import DBSession
from database_optimalisation import optimize_my_database as optimize
from database_operations import run_operations as operate
from database import LastRun
from ner import NERserver

dbs = DBSession().session
ner_server = NERserver()

# date = dbs.query(LastRun.date).order_by(LastRun.id.desc()).first()[0]
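# Hardcoded start date; the commented-out query above would resume from the
# last recorded run instead.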
date = dt.date(year=2009, month=1, day=1)
print("Indexing...", end="", flush=True)
index = Indexer(date, local=True)
index.bp_index()
print(" finished!")

ner_server.start()
print("Scraping...", end="", flush=True)
scrape(date, what_to_do="references people", local=True)
ner_server.stop()
print(" finished!")

print("Optimizing...", end="", flush=True)
optimize()
operate()
print(" finished!")

dbs.add(LastRun(date=dt.date.today()))
Example #35
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            if parsed_document == {}:  # RT
                continue
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        self._indexer.inverted_idx = {
            key: val
            for key, val in self._indexer.inverted_idx.items() if val != 1
        }
        self._indexer.postingDict = {
            key: val
            for key, val in self._indexer.postingDict.items() if len(val) != 1
        }
        print('Finished parsing and indexing.')
        # self._indexer.save_index('idx_bench')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and 
        assign to self._model, which is passed on to the searcher at query time.
        """
        filename = self._config.google_news_vectors_negative300_path
        self._model = gensim.models.KeyedVectors.load_word2vec_format(
            filename, binary=True, datatype=np.float16)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
Example #36
def run_engine(config):
    """

    :return:
    """

    number_of_documents = 0
    sum_of_doc_lengths = 0

    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem)
    indexer = Indexer(config, glove_dict)
    # documents_list = r.read_file(file_name=config.get__corpusPath())
    parquet_documents_list = r.read_folder(config.get__corpusPath())
    for parquet_file in parquet_documents_list:
        documents_list = r.read_file(file_name=parquet_file)
        # Iterate over every document in the file
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            if parsed_document is None:
                continue
            number_of_documents += 1
            sum_of_doc_lengths += parsed_document.doc_length
            # index the document data
            indexer.add_new_doc(parsed_document)

    # saves last posting file after indexer has done adding documents.
    indexer.save_postings()
    if len(indexer.doc_posting_dict) > 0:
        indexer.save_doc_posting()
    utils.save_dict(indexer.document_dict, "documents_dict", config.get_out_path())
    if len(indexer.document_posting_covid) > 0:
        indexer.save_doc_covid()

    indexer.delete_dict_after_saving()

    # merges posting files.
    indexer.merge_chunks()
    utils.save_dict(indexer.inverted_idx, "inverted_idx", config.get_out_path())

    dits = {'number_of_documents': number_of_documents, "avg_length_per_doc": sum_of_doc_lengths/number_of_documents }

    utils.save_dict(dits, 'details', config.get_out_path())
Example #37
def main():
    """
    Main function
    """
    # Download data for NLTK if not already done
    # nltk.download('all')

    # Read
    np.random.seed(5)
    baseline = False  ## Make this true if you want to run the baseline, which is a simple latent factor model
    path_to_save_results = './test/'

    imdb = Indexer()
    imdb_file = 'data/clothing_data_small.json'  ## path to data file
    logging.basicConfig(format='%(levelname)s: %(message)s',
                        level=logging.INFO)
    logging.info('Reading file %s' % imdb_file)
    imdb.read_file(imdb_file)
    logging.info('File %s read' % imdb_file)

    (
        vocab_size,
        user_list,  # remove
        movie_list,
        review_matrix,
        review_map,
        user_dict,
        movie_dict,
        rating_list,
        t_mean,
        movie_reviews,
        word_dictionary,
        U,
        M,
        R,
        test_indices) = imdb.get_mappings(path_to_save_results)

    mul_factor = 0.1
    ## Initialize
    alpha_vu = np.random.normal(0, sigma_u, (U, K)) * mul_factor
    alpha_bu = np.random.normal(0, sigma_u, (U, 1)) * mul_factor
    alpha_tu = np.random.normal(0, sigma_u, (U, A)) * mul_factor

    # User
    v_u = np.random.normal(0, sigma_u,
                           (U, K)) * mul_factor  # Latent factor vector
    b_u = np.random.normal(0, sigma_bu,
                           (U, 1)) * mul_factor  # Common bias vector
    theta_u = np.random.normal(0, sigma_ua,
                               (U, A)) * mul_factor  # Aspect specific vector

    # Movie
    v_m = np.random.normal(0, sigma_m,
                           (M, K)) * mul_factor  # Latent factor vector
    b_m = np.random.normal(0, sigma_bm,
                           (M, 1)) * mul_factor  # Common bias vector
    theta_m = np.random.normal(0, sigma_ma,
                               (M, A)) * mul_factor  # Aspect specific vector

    # Common bias
    b_o = np.random.normal(0, sigma_b0) * mul_factor

    # Scaling Matrix
    M_a = np.random.normal(0, sigma_Ma, (A, K)) * mul_factor

    params = np.concatenate(
        (alpha_vu.flatten('F'), v_u.flatten('F'), alpha_bu.flatten('F'),
         b_u.flatten('F'), alpha_tu.flatten('F'), theta_u.flatten('F'),
         v_m.flatten('F'), b_m.flatten('F'), theta_m.flatten('F'),
         M_a.flatten('F'), np.array([b_o]).flatten('F')))

    save_test_rmse = []
    # Get number of users and movies
    Users = len(user_list)
    Movies = len(movie_list)
    logging.info('No. of users U = %d' % Users)
    logging.info('No. of movies M = %d' % Movies)

    # change gibbs sampler initialization
    gibbs_sampler = GibbsSampler(vocab_size, review_matrix, rating_list,
                                 movie_dict, user_dict, movie_reviews,
                                 word_dictionary, U, M, R, test_indices)

    # Run Gibbs EM
    for it in range(1, MAX_ITER + 1):
        print('Running iteration %d of Gibbs EM' % it)
        print('Running E-Step - Gibbs Sampling')

        if not baseline:
            Nums, Numas, Numa = gibbs_sampler.run(vocab_size, review_matrix,
                                                  rating_list, user_dict,
                                                  movie_dict, movie_reviews,
                                                  word_dictionary, t_mean,
                                                  params, test_indices,
                                                  path_to_save_results)
        else:
            Nums = np.zeros((R, 2))
            Numas = np.zeros((R, A, 2))
            Numa = np.zeros((R, A))
        print('Running M-Step - Gradient Descent')
        for i in range(1, MAX_OPT_ITER + 1):
            params, save_test_rmse = optimizer(Nums, Numas, Numa, rating_list,
                                               t_mean, params, U, M, R,
                                               test_indices, save_test_rmse)
            np.save(path_to_save_results + 'params.npy', params)
            np.save(
                path_to_save_results +
                'performance_notime_medium_noreg_seed5.npy', save_test_rmse)
Example #38
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        self._parser.curr_idx = self.parse_and_index_tweet_list(documents_list, 0)
        self._indexer.save_index('idx_bench.pkl')
        print('Finished parsing and indexing.')

    def parse_and_index_tweet_list(self, documents_list, idx):

        for document in documents_list:
            # parse the document
            self._parser.curr_idx = idx
            parsed_document = self._parser.parse_doc(document)
            # add the document to the indexer here
            self._indexer.set_idx(idx)
            self._indexer.add_new_doc(parsed_document)
            idx += 1

        return idx-1

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        self._model = _Thesaurus()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
Example #39
 def setUp(self):
     self.indexer = Indexer("database")
Example #40
class TestMyCode(unittest.TestCase):
    def setUp(self):
        self.maxDiff = None
        self.window = Context_Window(
            'The girl named Alina Zakharova is a student',
            [Position_Plus(0, 4, 20),
             Position_Plus(0, 9, 30)], 8, 20)

    def tearDown(self):
        if hasattr(self, 'search'):
            del self.search
        file_list = os.listdir(path=".")
        for i in file_list:
            if i == 'database':
                database_exists = True
                os.remove(i)
            elif i.startswith('database.'):
                database_exists = True
                os.remove(i)

    def test_get_window_error(self):
        with self.assertRaises(TypeError):
            self.window.get_window(12, '12')

    def test_get_window_simple(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_window_one.txt', 'w')
        test_file_one.write('Alina Zakharova is a student)))')
        test_file_one.close()
        self.indexator.get_index_with_line('test_window_one.txt')
        del self.indexator
        self.search = SearchEngine('database')
        result = windows.Context_Window.get_window('test_window_one.txt',
                                                   Position_Plus(0, 16, 18), 1)
        self.win = Context_Window('string', 'positions', 'win_start',
                                  'win_end')
        self.win.string = 'Alina Zakharova is a student)))'
        self.win.positions = [Position_Plus(0, 16, 18)]
        self.win.win_start = 6
        self.win.win_end = 20
        self.assertEqual(result.string, self.win.string)
        self.assertEqual(result.positions, self.win.positions)
        self.assertEqual(result.win_start, self.win.win_start)
        self.assertEqual(result.win_end, self.win.win_end)
        self.assertEqual(result, self.win)
        os.remove('test_window_one.txt')

    def test_get_window_simple_plus(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_window_two.txt', 'w')
        test_file_one.write('Little Alina Zakharova is a linguist student)))')
        test_file_one.close()
        self.indexator.get_index_with_line('test_window_two.txt')
        del self.indexator
        self.search = SearchEngine('database')
        result = windows.Context_Window.get_window('test_window_two.txt',
                                                   Position_Plus(0, 23, 25), 2)
        self.win = Context_Window('string', 'positions', 'win_start',
                                  'win_end')
        self.win.string = 'Little Alina Zakharova is a linguist student)))'
        self.win.positions = [Position_Plus(0, 23, 25)]
        self.win.win_start = 7
        self.win.win_end = 36
        self.assertEqual(result.string, self.win.string)
        self.assertEqual(result.positions, self.win.positions)
        self.assertEqual(result.win_start, self.win.win_start)
        self.assertEqual(result.win_end, self.win.win_end)
        self.assertEqual(result, self.win)
        os.remove('test_window_two.txt')

    def test_get_window_begin(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_window_three.txt', 'w')
        test_file_one.write('Alina Zakharova is a student')
        test_file_one.close()
        self.indexator.get_index_with_line('test_window_three.txt')
        del self.indexator
        self.search = SearchEngine('database')
        result = windows.Context_Window.get_window('test_window_three.txt',
                                                   Position_Plus(0, 0, 5), 1)
        self.win = Context_Window('string', 'positions', 'win_start',
                                  'win_end')
        self.win.string = 'Alina Zakharova is a student'
        self.win.positions = [Position_Plus(0, 0, 5)]
        self.win.win_start = 0
        self.win.win_end = 15
        self.assertEqual(result.string, self.win.string)
        self.assertEqual(result.positions, self.win.positions)
        self.assertEqual(result.win_start, self.win.win_start)
        self.assertEqual(result.win_end, self.win.win_end)
        self.assertEqual(result, self.win)
        os.remove('test_window_three.txt')

    def test_get_window_end(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_window_four.txt', 'w')
        test_file_one.write('Alina Zakharova is a student')
        test_file_one.close()
        self.indexator.get_index_with_line('test_window_four.txt')
        del self.indexator
        self.search = SearchEngine('database')
        result = windows.Context_Window.get_window('test_window_four.txt',
                                                   Position_Plus(0, 21, 28), 3)
        self.win = Context_Window('string', 'positions', 'win_start',
                                  'win_end')
        self.win.string = 'Alina Zakharova is a student'
        self.win.positions = [Position_Plus(0, 21, 28)]
        self.win.win_start = 6
        self.win.win_end = 28
        self.assertEqual(result.string, self.win.string)
        self.assertEqual(result.positions, self.win.positions)
        self.assertEqual(result.win_start, self.win.win_start)
        self.assertEqual(result.win_end, self.win.win_end)
        self.assertEqual(result, self.win)
        os.remove('test_window_four.txt')

    def test_myError_str_not_found(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_window_five.txt', 'w')
        test_file_one.write('Alina Zakharova is a student')
        test_file_one.close()
        self.indexator.get_index_with_line('test_window_five.txt')
        del self.indexator
        self.search = SearchEngine('database')
        with self.assertRaises(TypeError):
            result = windows.Context_Window.get_window(
                'test_window_five.txt', Position_Plus(3, 21, 28), 3)
        os.remove('test_window_five.txt')

    def test_united_type_error(self):
        with self.assertRaises(TypeError):
            self.window.get_united_window(12, 'window)))')

    def test_crossed_type_error(self):
        with self.assertRaises(TypeError):
            self.window.is_crossed(12, 'window)))')

    def test_united_window(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_united_window.txt', 'w')
        test_file_one.write('The girl named Alina Zakharova is a student')
        test_file_one.close()
        self.indexator.get_index_with_line('test_united_window.txt')
        del self.indexator
        self.search = SearchEngine('database')
        window_A = windows.Context_Window.get_window('test_united_window.txt',
                                                     Position_Plus(0, 4, 20),
                                                     1)
        window_B = windows.Context_Window.get_window('test_united_window.txt',
                                                     Position_Plus(0, 9, 30),
                                                     1)
        window_A.get_united_window(window_B)
        self.win = windows.Context_Window(
            'The girl named Alina Zakharova is a student',
            [Position_Plus(0, 4, 20),
             Position_Plus(0, 9, 30)], 9, 20)
        self.assertEqual(window_A.string, self.win.string)
        self.assertEqual(window_A.win_start, self.win.win_start)
        self.assertEqual(window_A.win_end, self.win.win_end)
        os.remove('test_united_window.txt')

    def test_is_crossed(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_crossed_window.txt', 'w')
        test_file_one.write('The girl named Alina Zakharova is a student')
        test_file_one.close()
        self.indexator.get_index_with_line('test_crossed_window.txt')
        del self.indexator
        self.search = SearchEngine('database')
        window_A = windows.Context_Window.get_window('test_crossed_window.txt',
                                                     Position_Plus(0, 15, 20),
                                                     1)
        window_B = windows.Context_Window.get_window('test_crossed_window.txt',
                                                     Position_Plus(0, 8, 14),
                                                     1)
        crossed_AB = window_A.is_crossed(window_B)
        self.assertEqual(True, crossed_AB)
        os.remove('test_crossed_window.txt')

    def test_not_crossed(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_not_crossed_window.txt', 'w')
        test_file_one.write('The girl named Alina Zakharova is a student')
        test_file_one.close()
        self.indexator.get_index_with_line('test_not_crossed_window.txt')
        del self.indexator
        self.search = SearchEngine('database')
        window_A = windows.Context_Window.get_window(
            'test_not_crossed_window.txt', Position_Plus(0, 31, 33), 1)
        window_B = windows.Context_Window.get_window(
            'test_not_crossed_window.txt', Position_Plus(0, 8, 14), 1)
        crossed_AB = window_A.is_crossed(window_B)
        self.assertEqual(False, crossed_AB)
        os.remove('test_not_crossed_window.txt')

    def test_extend_window(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_extend_window.txt', 'w')
        test_file_one.write('Alina Zakharova is a student!!')
        test_file_one.close()
        self.indexator.get_index_with_line('test_extend_window.txt')
        del self.indexator
        self.search = SearchEngine('database')
        window = windows.Context_Window.get_window('test_extend_window.txt',
                                                   Position_Plus(0, 6, 15), 1)
        window.extend_window()
        extended_window = Context_Window('Alina Zakharova is a student!!',
                                         [Position_Plus(0, 6, 15)], 0, 30)
        self.assertEqual(window, extended_window)
        os.remove('test_extend_window.txt')

    def test_extend_window_two_words(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_extend_window.txt', 'w')
        test_file_one.write('Alina Zakharova is a student!!')
        test_file_one.close()
        self.indexator.get_index_with_line('test_extend_window.txt')
        del self.indexator
        self.search = SearchEngine('database')
        window_one = windows.Context_Window.get_window(
            'test_extend_window.txt', Position_Plus(0, 6, 15), 1)
        window_two = windows.Context_Window.get_window(
            'test_extend_window.txt', Position_Plus(0, 0, 5), 1)
        window_one.get_united_window(window_two)
        window_one.extend_window()
        extended_window = Context_Window(
            'Alina Zakharova is a student!!',
            [Position_Plus(0, 6, 15),
             Position_Plus(0, 0, 5)], 0, 30)
        self.assertEqual(window_one, extended_window)
        os.remove('test_extend_window.txt')

    def test_extend_window_rus(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_extend_window_rus.txt', 'w')
        test_file_one.write(
            'Прогать очень сложно! Алина Захарова студент лингвист!! Аня любит немецкий. В Петербурге идет дождь.'
        )
        test_file_one.close()
        self.indexator.get_index_with_line('test_extend_window_rus.txt')
        del self.indexator
        self.search = SearchEngine('database')
        window = windows.Context_Window.get_window(
            'test_extend_window_rus.txt', Position_Plus(0, 28, 36), 1)
        window.extend_window()
        extended_window = Context_Window(
            'Прогать очень сложно! Алина Захарова студент лингвист!! Аня любит немецкий. В Петербурге идет дождь.',
            [Position_Plus(0, 28, 36)], 22, 55)
        self.assertEqual(window, extended_window)

    def test_extend_window_rus_one(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_extend_window_rus.txt', 'w')
        test_file_one.write('Пьер с грустью слышал над собою насмешки.')
        test_file_one.close()
        self.indexator.get_index_with_line('test_extend_window_rus.txt')
        del self.indexator
        self.search = SearchEngine('database')
        window = windows.Context_Window.get_window(
            'test_extend_window_rus.txt', Position_Plus(0, 0, 4), 1)
        window.extend_window()
        extended_window = Context_Window(
            'Пьер с грустью слышал над собою насмешки.',
            [Position_Plus(0, 0, 4)], 0, 41)
        self.assertEqual(window, extended_window)

    def test_extend_window_rus_two(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_extend_window_rus.txt', 'w')
        test_file_one.write(
            'С разных сторон виднелись пожары. Пьер тогда еще не понимал значения сожженной Москвы и с ужасом смотрел на эти пожары.'
        )
        test_file_one.close()
        self.indexator.get_index_with_line('test_extend_window_rus.txt')
        del self.indexator
        self.search = SearchEngine('database')
        window = windows.Context_Window.get_window(
            'test_extend_window_rus.txt', Position_Plus(0, 34, 38), 1)
        window.extend_window()
        extended_window = Context_Window(
            'С разных сторон виднелись пожары. Пьер тогда еще не понимал значения сожженной Москвы и с ужасом смотрел на эти пожары.',
            [Position_Plus(0, 34, 38)], 0, 119)
        self.assertEqual(window, extended_window)

    def test_already_extended_window(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_already_extended_window.txt', 'w')
        test_file_one.write('Alina Zakharova is a student!!')
        test_file_one.close()
        self.indexator.get_index_with_line('test_already_extended_window.txt')
        del self.indexator
        self.search = SearchEngine('database')
        window = windows.Context_Window.get_window(
            'test_already_extended_window.txt', Position_Plus(0, 16, 18), 2)
        os.remove('test_already_extended_window.txt')

    def test_highlight_window_one(self):
        self.indexator = Indexer('database')
        test_file_one = open('test_highlight_window.txt', 'w')
        test_file_one.write('Alina Zakharova is a student')
        test_file_one.close()
        self.indexator.get_index_with_line('test_highlight_window.txt')
        del self.indexator
        self.search = SearchEngine('database')
        window = windows.Context_Window.get_window('test_highlight_window.txt',
                                                   Position_Plus(0, 6, 15), 1)
        result = window.highlight_window()
        output_string = 'Alina <b>Zakharova</b> is'
        self.assertEqual(result, output_string)
        os.remove('test_highlight_window.txt')
Example #41
0
class SearchEngine:
    """
    If you need to reindex, change the
    status to False in the "index_status.log" file.
    """
    def __init__(self):
        # Make an instance of file handler
        self.file_handler = FileHandler()
        # Make an instance of indexer
        self.indexer = Indexer(self.file_handler, file_count_offset=10000)

        # Check if the indexing is completed. If not, index the documents
        if not self.file_handler.get_index_status():
            self.index()

        # Open files
        self.fp_dict = self.file_handler.load_json('./db/fp_locations.json')
        self.doc_id_dict = self.file_handler.load_json('./db/doc_id.json')
        self.final_index = open('./db/index.txt')

        cached_words = self.cache_stop_words()
        # Cached words are added to the query instance to check during query time
        self.query = Query(self.file_handler, self.indexer, cached_words)

    def cache_stop_words(self):
        cached_words = {}
        stop_words = set(stopwords.words('english'))

        # For every index, cache the stop words
        for line in self.final_index:
            index = Query.fast_eval(line)

            if index[0] in stop_words:
                cached_words[index[0]] = index[1]

        return cached_words

    def index(self):
        start_time = datetime.now()

        # Index the webpages into partial indexes
        self.indexer.index('./DEV', restart=True)
        # Merge partial indexes to one single index
        self.indexer.merge_indexes('./db')
        # Calculate the tf_idf scores for each index
        normalizer = self.indexer.calculate_tf_idf(
            './db/index.txt', './db/index_tf_idf.txt',
            self.file_handler.count_number_of_line('./db/index.txt'))
        # Normalize the tf_idf scores
        self.indexer.normalize_tf_idf('./db/index_tf_idf.txt',
                                      './db/index.txt', normalizer)
        # Get file pointer locations for each index
        self.indexer.get_fp_locations('./db/index.txt',
                                      './db/fp_locations.json')

        end_time = datetime.now()
        process_time = end_time - start_time

        print("\nStart Time : {}\nEnd Time : {}\nTime elapsed : {}\n".format(
            start_time, end_time, process_time))

    def search(self):

        # Gets query from the user
        # Start time is calculated as soon as the query is received.
        start_time = self.query.get_query()
        # Process the query
        self.query.process_query()
        # Get result of the query
        result = self.query.get_result()

        end_time = datetime.now()
        process_time = end_time - start_time

        print(
            "\nStart Time : {}\nEnd Time : {}\nTime elapsed : {} ms\n".format(
                start_time, end_time,
                process_time.total_seconds() * 1000))

    def run(self):
        while True:
            self.search()
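
cache_stop_words above makes one pass over the on-disk index at startup and keeps the postings of English stop words in memory, so that these very frequent terms never trigger a disk lookup at query time. A minimal sketch of that caching pattern, assuming a simplified index of tab-separated term/postings lines rather than the format Query.fast_eval actually parses:

def cache_stop_word_postings(index_path, stop_words):
    """Return {stop_word: postings} for every stop word present in the index file.

    Assumes one term per line, term and postings separated by a tab; the real
    index format above is different (Query.fast_eval parses it).
    """
    cached = {}
    with open(index_path) as index_file:
        for line in index_file:
            term, _, postings = line.rstrip('\n').partition('\t')
            if term in stop_words:
                cached[term] = postings
    return cached

# Usage sketch (file name and format are assumptions):
# cached = cache_stop_word_postings('./db/index.txt', set(stopwords.words('english')))
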
Example #42
0
parser.add_argument('-test',
                    '--test_mode',
                    action="store_true",
                    help="testing mode")
args = parser.parse_args()
pattern = '*.' + str(args.electrode) + '.wav'
data_dir = args.data_dir
out_dir = args.out_dir
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

if data_dir[-1] != '/':
    data_dir += '/'
subj = int(data_dir[-2])
assert subj in [1, 2, 3]
indexer = Indexer()
tain_idx, test_idx = indexer.run(data_dir, pattern, testing=args.test_mode)

fs = 400
cd = 240000 * 1000 / fs
common_params = dict(sampling_freq=fs, clip_duration=cd, frame_duration=512)
tain_params = AudioParams(random_scale_percent=5.0, **common_params)
test_params = AudioParams(**common_params)
common = dict(target_size=1, nclasses=2)
tain_set = 'full' if args.test_mode else 'tain'
test_set = 'test' if args.test_mode else 'eval'
test_dir = data_dir.replace('train', 'test') if args.test_mode else data_dir

tain = DataLoader(set_name=tain_set,
                  media_params=tain_params,
                  index_file=tain_idx,
Example #43
0
 def __init__(self):
     self.dbagent = DBAgent()
     self.dbagent_thread = None
     self.communication_object = CommunicationObject()
     self.indexer = Indexer()
Example #44
0
from file_reader import FileReader
f = FileReader(file_name="input/small_foods.txt")



from indexer import Indexer
i = Indexer()

from query import Query
q = Query()
a = q.execute_query(['jumbo', 'salted', 'peanuts'])
for key, values in a.iteritems():
    if values > 2:
        print key
Example #45
0
class Graph:
    """
    Implements a directed graph.
    self.nodes contains a map from node names to node index.
    self.links contains a map from node index to all linked nodes.
    self.redirects contains a map from transient node index to permanent node index.
    """
    def __init__(self):
        self.nodes = Indexer()
        self.links = defaultdict(list)
        self.redirects = {}

    def __len__(self):
        return len(self.nodes)

    def AddLink(self,from_name,to_name):
        from_index = self.Index(from_name)
        to_index = self.Index(to_name)
        self.links[from_index].append(to_index)

    def AddRedirect(self,from_name,to_name):
        self.redirects[from_name] = to_name

    def Index(self,name):
        index = self.nodes[name]
        while index in self.redirects:
            index = self.redirects[index]
        return index

    def Links(self,from_index):
        if isinstance(from_index,int):
            return self.links[from_index]
        else:
            from_index = self.Index(from_index)
            links = self.links[from_index]
            link_names = [self.nodes.rev[link] for link in links]
            return link_names

    def PageRank(self, reset=0.15, steps_per_iteration=int(1e7), max_iter=100,
                 tol='rank'):
        """
        Computes the Page Rank of each node in the graph.
        @param reset The probability of making a random jump.
        @param steps_per_iteration The number of steps before checking convergence.
        @param max_iter The number of iterations before giving up.
        @param tol The convergence criteria.
          If 'rank', then it continues until the ordering of pages has stabilized.
          If the value is a float, then it continues until all values have changed less than the tolerance.
        """
        pure_pages = list(set(self.nodes.values()) - set(self.redirects))
        def random_jump():
            return random.choice(pure_pages)

        if tol=='rank':
            converged = rank_converged
        else:
            converged = tol_converged(tol)

        page_rank = numpy.zeros(len(self.nodes))
        total_steps = 0

        current = random_jump()
        for iter_num in range(max_iter):
            print('iter_num: ',iter_num)
            iteration_counts = numpy.zeros(len(self.nodes))
            for step_num in range(steps_per_iteration):
                if step_num%1000000==0:
                    print('step_num: ',step_num)
                options = self.Links(current)
                if not options or random.random()<reset:
                    current = random_jump()
                else:
                    current = random.choice(options)
                iteration_counts[current] += 1
            #Weighted average of new estimation and previous estimation
            prev = page_rank
            page_rank = (page_rank*total_steps + iteration_counts)/(total_steps + steps_per_iteration)
            total_steps += steps_per_iteration
            if converged(prev,page_rank):
                break
        return page_rank

    def PageRankMatrix(self, reset=0.15, max_iter=100, tol='rank'):
        """
        Computes the Page Rank of each node in the graph.
        Does so using matrix multiplication, rather than a random walk.
        """
        try:
            return self.page_rank
        except AttributeError:
            pass

        if tol=='rank':
            converged = rank_converged
        else:
            converged = tol_converged(tol)

        num_nodes = len(self.nodes)
        #Transpose self.links, so it can be used to find links to a page, not just from
        linked_from = defaultdict(list)
        for from_node,to_node_list in self.links.items():
            for to_node in to_node_list:
                linked_from[to_node].append((from_node,1/len(to_node_list)))
        #Find all dangling nodes
        dangling_nodes = set()
        for nodenum in range(num_nodes):
            if nodenum not in self.links or not self.links[nodenum]:
                dangling_nodes.add(nodenum)

        page_rank = numpy.ones(num_nodes)/num_nodes

        for iter_num in range(max_iter):
            print(iter_num)
            prev = page_rank
            page_rank = numpy.zeros(num_nodes)
            dangling_contrib = (1-reset)*sum(prev[d] for d in dangling_nodes)/num_nodes
            reset_contrib = reset/num_nodes
            for to_index in range(num_nodes):
                link_contrib = (1-reset)*sum(prev[from_index]*weight for from_index,weight in linked_from[to_index])
                page_rank[to_index] = link_contrib + dangling_contrib + reset_contrib
            if converged(prev,page_rank):
                break

        self.page_rank = page_rank
        return page_rank


    def TopNPages(self,n):
        ranking = self.PageRankMatrix()
        node_names = list(self.nodes)
        node_names.sort(key = lambda name:ranking[self.Index(name)],reverse=True)
        return node_names[:n]

    def WriteAllPageRanks(self,filename):
        ranking = self.PageRankMatrix()
        node_ranks = [(name,ranking[self.Index(name)]) for name in self.nodes]
        node_ranks.sort(key = lambda k:k[1],reverse=True)
        with open(filename,'w') as f:
            for name,rank in node_ranks:
                f.write('{}\t{}\n'.format(name,rank))

    def ExportCSV(self,filename,n):
        pages = set(self.TopNPages(n))
        with open(filename,'w') as f:
            f.write('Source,Target\n')
            for i,page in enumerate(pages):
                if i%1000==0:
                    print('Saving page',i)
                for link in self.Links(page):
                    if link in pages:
                        f.write('{},{}\n'.format(page,link))
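
PageRankMatrix above transposes the link map, spreads the rank mass of dangling nodes uniformly, and mixes in a reset probability on every iteration. Since the Graph class depends on an Indexer and on rank_converged/tol_converged helpers that are not shown, the following is a self-contained sketch of the same update on a tiny hard-coded link map (node numbers and links are made up for illustration):

import numpy

def pagerank_sketch(links, num_nodes, reset=0.15, max_iter=100, tol=1e-10):
    """Power iteration matching the dangling-node / reset logic of PageRankMatrix."""
    # Transpose the link map: for each target, who links to it and with what weight.
    linked_from = {n: [] for n in range(num_nodes)}
    for from_node, to_nodes in links.items():
        for to_node in to_nodes:
            linked_from[to_node].append((from_node, 1.0 / len(to_nodes)))
    dangling = [n for n in range(num_nodes) if not links.get(n)]

    rank = numpy.ones(num_nodes) / num_nodes
    for _ in range(max_iter):
        prev = rank
        dangling_contrib = (1 - reset) * prev[dangling].sum() / num_nodes
        reset_contrib = reset / num_nodes
        rank = numpy.array([
            (1 - reset) * sum(prev[f] * w for f, w in linked_from[t])
            + dangling_contrib + reset_contrib
            for t in range(num_nodes)
        ])
        if numpy.all(numpy.abs(rank - prev) < tol):
            break
    return rank

# Three nodes: 0 -> 1, 1 -> 0 and 2, node 2 is dangling.
print(pagerank_sketch({0: [1], 1: [0, 2], 2: []}, num_nodes=3))
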
Example #46
0
def main():
    train_words = parse_words(open(TRAIN_FILE_PATH, 'rb'),
                              tag_scheme=TAG_SCHEME)
    train_sentences = split_words_to_sentences(train_words)
    dev_words = parse_words(open(DEV_FILE_PATH, 'rb'), tag_scheme=TAG_SCHEME)
    dev_sentences = split_words_to_sentences(dev_words)
    test_words = parse_words(open(TEST_FILE_PATH, 'rb'), tag_scheme=TAG_SCHEME)

    external_word_embeddings = {}
    for line in open(
            '/Users/konix/Documents/pos_data/glove.6B/glove.6B.100d.txt',
            'rb').readlines():
        word, embedding_str = line.split(' ', 1)
        embedding = np.asarray(
            [float(value_str) for value_str in embedding_str.split()])
        external_word_embeddings[word] = embedding

    word_list = []
    char_list = []
    tag_list = []
    for sentence_ in train_sentences:
        for word_ in sentence_:
            word_list.append(word_.text.lower())
            tag_list.append(word_.gold_label)
            char_list.extend(word_.text)

    word_counter = Counter(word_list)
    word_indexer = Indexer()
    word_indexer.index_object_list([
        word_text for (word_text, word_count) in word_counter.iteritems()
        if word_count >= 1
    ])
    word_indexer.index_object_list(external_word_embeddings.keys())
    word_indexer.index_object('_UNK_')

    char_counter = Counter(char_list)
    char_indexer = Indexer()
    char_indexer.index_object_list(char_counter.keys())

    tag_counter = Counter(tag_list)
    tag_indexer = Indexer()
    tag_indexer.index_object_list(tag_counter.keys())

    tagger = BiLstmNerTagger(word_indexer, char_indexer, tag_indexer,
                             external_word_embeddings)

    del word_list
    del char_list
    del tag_list
    del external_word_embeddings
    gc.collect()

    tagger.train(train_sentences, dev_sentences, iterations=50)

    word_index = 0
    while word_index < len(dev_words):
        sentence = dev_words[word_index].sentence
        tagger.tag_sentence(sentence)
        word_index += len(sentence)
    format_words(open('/tmp/dev_ner', 'wb'), dev_words, tag_scheme=TAG_SCHEME)

    word_index = 0
    while word_index < len(test_words):
        sentence = test_words[word_index].sentence
        tagger.tag_sentence(sentence)
        word_index += len(sentence)
    format_words(open('/tmp/test_ner', 'wb'),
                 test_words,
                 tag_scheme=TAG_SCHEME)
Example #47
0
class RegularGrid(Grid):
    def __init__(self,grid_desc):
        assert isinstance(grid_desc,(list,tuple))
        for gd in grid_desc:
            assert isinstance(gd,(list,tuple))
            assert 3 == len(gd)        
        self.dim = len(grid_desc)
        self.grid_desc = grid_desc # List of (low,high,num) triples

        (low,hi,num_cells) = zip(*self.grid_desc)
        self.lower_bound = np.array(low,dtype=np.double)
        self.upper_bound = np.array(hi,dtype=np.double)
        self.num_cells = np.array(num_cells,dtype=np.integer)
        assert not np.any(self.num_cells <= 0)
        self.num_nodes = self.num_cells + 1

        # Cell dimensions
        self.delta = (self.upper_bound - self.lower_bound)
        self.delta /= self.num_cells.astype(np.double)

        # Initialize the indexer
        self.cell_indexer = Indexer(self.num_cells)
        self.node_indexer = Indexer(self.num_nodes)

        # Fuzz to convert [low,high) to [low,high]
        self.fuzz = 1e-15

    def points_to_cell_coords(self,points):
        """
        Figure out where points are. Returns the cell coordinate.
        """
        assert is_mat(points) 
        (N,D) = points.shape
        assert D == self.dim
        
        # Get the OOB info
        oob = OutOfBounds()
        oob.build_from_points(self,points)
        assert oob.check()
        
        raw_coords = np.empty((N,D))
        for d in xrange(D):
            (low,high,num_cells) = self.grid_desc[d]
            # Transform: [low,high) |-> [0,n)
            transform = num_cells * (points[:,d] - low) / (high - low)
            transform += self.fuzz
            raw_coords[:,d] = np.floor(transform).astype(np.integer)
            # Add a little fuzz to make sure stuff on the boundary is
            # mapped correctly

            # Fuzz top boundary to get [low,high]
            fuzz_mask = np.logical_and(high <= points[:,d],
                                     points[:,d] < high + 2*self.fuzz)
            raw_coords[fuzz_mask,d] = num_cells - 1
            # Counts things just a littttle bit greater than last cell
            # boundary as part of the last cell
        
        raw_coords[oob.mask,:] = np.nan
        assert is_int(raw_coords)
        coords = Coordinates(raw_coords,oob)
        assert coords.check()

        return coords
    
    def points_to_cell_indices(self,points):
        assert is_mat(points)
        (N,D) = points.shape
        
        cell_coords = self.points_to_cell_coords(points)
        assert isinstance(cell_coords,Coordinates)
        assert (N,D) == cell_coords.shape
        
        cell_indices = self.cell_indexer.coords_to_indices(cell_coords)
        assert is_vect(cell_indices)
        assert (N,) == cell_indices.shape
        
        return cell_indices

    def cell_indices_to_cell_coords(self,cell_indices):
        cell_coords = self.cell_indexer.indices_to_coords(cell_indices)
        return cell_coords

    def cell_indices_to_mid_points(self,cell_indices):
        assert is_vect(cell_indices)

        low_points = self.cell_indices_to_low_points(cell_indices)
        mid_points = low_points + row_vect(0.5 * self.delta)
        assert is_mat(mid_points)
        assert mid_points.shape[0] == cell_indices.shape[0]
        
        return mid_points

    def cell_indices_to_low_points(self,cell_indices):
        assert is_vect(cell_indices)
        
        cell_coords = self.cell_indexer.indices_to_coords(cell_indices)
        assert isinstance(cell_coords,Coordinates)
        assert cell_coords.check()
        
        low_points = self.cell_coords_to_low_points(cell_coords)
        assert is_mat(low_points)
        assert cell_coords.shape == low_points.shape

        return low_points
        
        
    def cell_coords_to_low_points(self,cell_coords):
        assert isinstance(cell_coords,Coordinates)
        assert self.dim == cell_coords.dim
        assert cell_coords.check()
        
        C = cell_coords.coords
        oob = cell_coords.oob
        assert np.all(np.isnan(C[oob.mask,:])) 
        low_points = row_vect(self.lower_bound) + C * row_vect(self.delta)
        
        assert is_mat(low_points)
        
        assert np.all(np.isnan(low_points[oob.mask,:])) 
        assert cell_coords.shape == low_points.shape
        return low_points
    
    def node_indices_to_node_points(self,node_indices):
        assert is_vect(node_indices)
        (N,) = node_indices.shape
        
        node_coords = self.node_indexer.indices_to_coords(node_indices)
        assert isinstance(node_coords,Coordinates)
        
        oob = node_coords.oob
        C = node_coords.coords
        assert np.all(np.isnan(C[oob.mask,:]))

        node_points = row_vect(self.lower_bound) + C * row_vect(self.delta)
        assert is_mat(node_points)
        assert np.all(np.isnan(node_points[oob.mask,:]))
        assert node_coords.shape == node_points.shape
        
        return node_points

    def cell_indices_to_vertex_indices(self,cell_indices):
        assert is_vect(cell_indices)
        
        cell_coords = self.cell_indexer.indices_to_coords(cell_indices)
        assert isinstance(cell_coords,Coordinates)
        
        vertex_indices = self.cell_coords_to_vertex_indices(cell_coords)
        assert is_mat(vertex_indices) # (N x 2**D) matrix

        return vertex_indices
        
    def cell_coords_to_vertex_indices(self,cell_coords):
        assert isinstance(cell_coords,Coordinates)
        (N,D) = cell_coords.shape
        assert self.dim == D


        """
        The low node index in the cell has the same coords in node-land
        as the cell in cell-land:
         |   |
        -o - o-
         | x |
        -x - o-
         |   |
        """        
        low_vertex = self.node_indexer.coords_to_indices(cell_coords)

        # Array of index offsets to reach every vertex in cell
        shift = self.node_indexer.cell_shift()
        assert (2**D,) == shift.shape
        
        vertices = col_vect(low_vertex) + row_vect(shift)
        assert (N,2**D) == vertices.shape

        """
        Handle out of bound nodes. There is a constant offset for 
        converting cell oob indices to node oob indices.
        Also the difference between max spatial indices.
        """
        oob = cell_coords.oob
        if oob.has_oob():
            # Figure out the right oob node
            oob_indices = cell_coords.oob.indices[oob.mask]
            offset = self.node_indexer.get_num_spatial_nodes()
            vertices[oob.mask,0] = oob_indices + offset
            vertices[oob.mask,1:] = np.nan

        return vertices

    def points_to_low_vertex_rel_distance(self,points,cell_coords):
        assert is_mat(points)
        assert isinstance(cell_coords,Coordinates)
        (N,D) = points.shape
        assert (N,D) == cell_coords.shape
        
        low_vertex = self.cell_coords_to_low_points(cell_coords)
        
        dist = np.empty((N,D))
        for d in xrange(D):
            dist[:,d] = (points[:,d] - low_vertex[:,d]) / self.delta[d]

        # OOB -> 0 distance from OOB node
        dist[cell_coords.oob.mask,:] = 0.0
        
        assert np.all(dist >= 0.0)
        assert np.all(dist <= 1.0)

        return dist

    def are_points_oob(self,points):
        """
        Check if points are out-of-bounds
        """
        (N,D) = points.shape
        assert D == self.dim

        L = np.any(points < row_vect(self.lower_bound),axis=1)
        U = np.any(points > row_vect(self.upper_bound) + self.fuzz,axis=1)
        assert (N,) == L.shape
        assert (N,) == U.shape

        return np.logical_or(L,U)
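
The heart of points_to_cell_coords above is the per-dimension transform floor(num_cells * (x - low) / (high - low)), with a small fuzz so that points sitting exactly on the upper boundary land in the last cell rather than out of bounds. A self-contained one-dimensional sketch of that mapping (the helper name and the -1 out-of-bounds marker are illustrative; the class above uses an OutOfBounds object instead):

import numpy as np

def points_to_cells_1d(x, low, high, num_cells, fuzz=1e-15):
    """Map values in [low, high] to integer cell indices 0 .. num_cells-1.

    Values in [high, high + 2*fuzz) are clamped into the last cell, mirroring
    the boundary handling in RegularGrid.points_to_cell_coords; anything else
    outside [low, high] is reported as -1.
    """
    x = np.asarray(x, dtype=np.double)
    cells = np.floor(num_cells * (x - low) / (high - low) + fuzz).astype(int)
    cells[np.logical_and(x >= high, x < high + 2 * fuzz)] = num_cells - 1
    cells[np.logical_or(x < low, x >= high + 2 * fuzz)] = -1
    return cells

print(points_to_cells_1d([0.0, 0.49, 0.5, 1.0, 1.2], low=0.0, high=1.0, num_cells=4))
# -> [ 0  1  2  3 -1]
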
Example #48
0
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0

        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')

        # self._indexer.save_index("idx_bench.pkl")
        #
        # indexer_dic = utils.load_obj("idx_bench")
        #
        self._indexer.save_index("idx.pkl")  # TODO - we need submit this

        indexer_dic = utils.load_obj("idx")  # TODO - we need submit this

        localMethod = True
        globalMethod = False
        wordNet = False
        spellChecker = False

        if localMethod:
            indexer_dic["local"] = True

        if wordNet:
            indexer_dic["wordnet"] = True

        if spellChecker:
            indexer_dic["spellChecker"] = True



        if globalMethod:
            docs_dic, Sij_dic = compute_Wi(indexer_dic, globalMethod)
            indexer_dic["docs"] = docs_dic
            indexer_dic["global"] = Sij_dic
        else:
            docs_dic = compute_Wi(indexer_dic)
            indexer_dic["docs"] = docs_dic

        # utils.save_obj(indexer_dic, "idx_bench")
        utils.save_obj(indexer_dic, "idx")  # TODO - we need submit this



    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """

        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
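
A minimal driver for the class above, assuming this SearchEngine is importable and that a parquet file of tweets exists at the (made-up) path below; the index file names follow the ones hard-coded in build_index_from_parquet:

# Hypothetical usage sketch; the parquet path is an assumption.
engine = SearchEngine(config=None)
engine.build_index_from_parquet('data/tweets.snappy.parquet')
engine.load_index('idx.pkl')

n_relevant, tweet_ids = engine.search('covid vaccine side effects')
print(n_relevant)
print(tweet_ids[:10])  # ten most relevant tweet ids first
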
Example #49
0
    def make_index(self):
        idxr = Indexer(self.source_file, self.index_file)
        idxr.make_index()

        self.make_trie()
Example #50
0
from indexer import Indexer
import os

if __name__ == "__main__":
    indexer = Indexer()
    curpath = os.getcwd()

    for file in os.listdir(curpath):
        if file[0:4] == "Wiki":
            print("Found file " + file)
            indexer.parse_files(os.path.join(curpath, file))
            print("Parsed " + file)
Example #51
SEED_URL = 'http://mysql12.f4.htw-berlin.de/crawl/'
SEED_PAGES = ('d01.html', 'd06.html', 'd08.html')

STOP_WORDS = ['d01', 'd02', 'd03', 'd04', 'd05', 'd06', 'd07', 'd08',  
'a', 'also', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'do',
'for', 'have', 'is', 'in', 'it', 'of', 'or', 'see', 'so',
'that', 'the', 'this', 'to', 'we']


crawler = Crawler([urljoin(SEED_URL, page) for page in SEED_PAGES])

page_rank = PageRank(crawler.webgraph_in, crawler.webgraph_out)
page_rank.build_graph()

index = Indexer(crawler.contents, STOP_WORDS)
index.build_index()

scorer = Scorer(index)

print("> SIMPLE SEARCH ENGINE (by Tammo, Tim & Flo)")

while True:
    scores = scorer.calculate_scores(input("\n> query: "))

    if not scores:
        print("your search term does not occur on any page")
        continue

    ranked_scores = [(url, score, page_rank.get_rank(url), score * page_rank.get_rank(url)) for url, score in scores.items()]
    
Example #52
0
 def get_line(self, line_num):
     idxr = Indexer(self.source_file, self.index_file)
     line = None
     with idxr as i:
         line =  i.read(line_num)
     return line
Example #53
0
class Blog(object):

	
	def __init__(self):
		
		# create indexer 
		self.Idx = Indexer()

		# create two connection instances
		self.Post = None
		self.InvIdx = None

		self.index_fields = []

	def set_db(self,Blog_DB):
		self.Post = Blog_DB.posts
		self.InvIdx = Blog_DB.invidx

	def set_index_fields(self,fields):

		if not isinstance(fields, list): 
			raise Exception("Fields must be a list")

		self.index_fields = fields
		self.Idx.set_idx_fields(fields)


	def save_post(self,post):
		logging.debug('save_post: ' + str(post))
		if self.index_fields == []:
			raise Exception("No fields to index. Please set it first!")
		if isinstance(post,list): 
			raise Exception("Only accept 1 post")

		
		if logging.root.level == logging.DEBUG:
			post_start_time = datetime.datetime.utcnow()

		# inserting post to posts collection
		obj_id = self.Post.insert(post)

		if logging.root.level == logging.DEBUG:
			post_end_time = datetime.datetime.utcnow()

		
		if obj_id == None:
			raise Exception("Error saving to mongodb")
		logging.debug('Saving post to mongo is OK')

		# strip unnecessary string
		#obj_id_strip = str(obj_id).strip('ObjectId("').rstrip('")')
		#logging.debug('strip object_id to: ' + obj_id_strip)
		
		if logging.root.level == logging.DEBUG:
			idx_start_time = datetime.datetime.utcnow()

		# get word
		words = self.Idx.index(post)		

		# updating words to inverted index
		# using loop
		# TODO: change to bulk update
		for word in words:
			#print word
			#self.InvIdx.update({"word":word},{"$push":{"docs":obj_id_strip}},True)
			self.InvIdx.update({"word":word},{"$push":{"docs":obj_id}},True)
		
		if logging.root.level == logging.DEBUG:
			idx_end_time = datetime.datetime.utcnow()
			# print info
			post_time = post_end_time-post_start_time
			idx_time = idx_end_time-idx_start_time
			total_time = post_time + idx_time
			
			logging.debug('time to save post: ' +str(post_time.total_seconds()))
			logging.debug('time to save idx: ' +str(idx_time.total_seconds()))
			logging.debug('total time: ' +str(total_time.total_seconds()))

		return obj_id
	
	def get_dummy_post(self,number):
		
		if (number<1) or (number>4): 
				raise Exception("Choose 1..4")

		posts = {}
		posts[1] = "Six people have been shot dead after a Russian lawyer opened fire on his colleagues at a pharmacy company"
		posts[2] = "Water and Venice usually go together like bees and honey. But not when there's as much rain"
		posts[3] = "Two men inside the utility truck have a lucky escape after a passing freight train collides with their vehicle"
		posts[4] = "Super storm Sandy gives New York a historic drenching.\nBattery Park in lower Manhattan floods as record high water"

		return {"title":"Dummy post "+str(number) ,"content": posts[number], "time":str(datetime.datetime.utcnow())}

	def clear(self):
		self.Post.remove()
		self.InvIdx.remove()

	def search(self,input_text):


		# get time: start first query
		if logging.root.level == logging.DEBUG:
			query_idx_start_time = datetime.datetime.utcnow()

		# tokenize query
		words_text_input = self.Idx.tokenize(input_text)

		# build query to get doc_ids
		list_words_text_input = []
		for word_text_input in words_text_input:
			#print word_text_input
			cond_words_text_input = {"word": word_text_input}
			list_words_text_input.append(cond_words_text_input)
		final_words_text_input = {"$or":list_words_text_input}
		
		# get doc_ids from inverted index
		doc_ids = [queryIdx.values()[0] for queryIdx in self.InvIdx.find( final_words_text_input, {"docs" :1 })]
		# remove duplicate doc_id
		doc_ids = set([doc_id[0] for doc_id in doc_ids])

		# get time: end first query & start second query
		if logging.root.level == logging.DEBUG:
			query_idx_end_time = datetime.datetime.utcnow()
			query_col_start_time = query_idx_end_time

		# build query to get documents by doc_ids
		list_doc = []
		for doc_id in doc_ids:
			cond_doc = {"_id": ObjectId(doc_id)}
			list_doc.append(cond_doc)
		final_doc = {"$or":list_doc}

		# get post from posts collection
		docs = self.Post.find(final_doc)


		if logging.root.level == logging.DEBUG:
			query_col_end_time = datetime.datetime.utcnow()
			
			# print info
			
			query_idx_time = query_idx_end_time - query_idx_start_time
			query_col_time = query_col_end_time - query_col_start_time
			total_time = query_idx_time + query_col_time
						
			logging.debug('time to query invidx: ' +str(query_idx_time.total_seconds()))
			logging.debug('time to query posts: ' +str(query_col_time.total_seconds()))
			logging.debug('total query time: ' +str(total_time.total_seconds()))

		return docs
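
A usage sketch for the Blog class above, assuming Blog and Indexer are importable and a local MongoDB instance is reachable through pymongo; the database name and the indexed fields are made up:

from pymongo import MongoClient

client = MongoClient('localhost', 27017)

blog = Blog()
blog.set_db(client.blog_demo)          # exposes .posts and .invidx collections
blog.set_index_fields(['title', 'content'])

post_id = blog.save_post(blog.get_dummy_post(1))
print('saved post', post_id)

for doc in blog.search('Russian lawyer'):
    print(doc['title'])
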
Example #54
0
def main() -> None:
    Indexer(DOCS_DIR, DOC_SIZE)
Example #55
from indexer import Indexer
import sys

source_file = sys.argv[1]
index_file = sys.argv[2]

idxr = Indexer(source_file, index_file)
idxr.make_index()

sys.stdout.write("Done !!\n")
Example #56
0
 def __init__(self, config=None):
     self._config = config
     self._parser = Parse()
     self._indexer = Indexer(config)
     self._model = None
Example #57
>>> expand_statement("m10m")
"margin: 10em"

>>> expand_statement("    m10m")
"    margin: 10em"

>>> expand_property("pad")
"padding:"
"""
import re
from definitions import definitions
from indexer import Indexer

# Indexing
index = Indexer()
index.index(definitions)

# Also see http://www.w3.org/TR/css3-values/
line_expr = re.compile(r'^(\s*)(.*?)$')
rule_expr = re.compile(r'^((?:[a-z]+-)*[a-z]+): *([^\s].*?);?$')
value_expr = re.compile(r'^([^\.\d-]*)(-?\d*\.?\d+)(x|p[tcx]?|e[mx]?|s|m[ms]?|rem|ch|v[wh]|vmin|max|%|)$')
semicolon_expr = re.compile(r';\s*$')
selectorlike_expr = re.compile(r'.*(link|visited|before|placeholder|root|after|focus|hover|active|checked|selected).*')
ends_in_brace_expr = re.compile(r'.*\{\s*$')

def expand_statement(line, usecolon=True):
    """Expands a statement line. Executed when pressing <Enter>.

        "db"          => "display: block"
        "m3m"         => "margin: 3em"
Example #58
0
class IrregularGrid(Grid):
    """
    Rectilinear grid from irregular, but sorted, list of node locations
    """
    def __init__(self,node_lists):
        self.dim = len(node_lists)
        self.node_lists = np.array(node_lists)
        # List of np.ndarray cutpoint locations

        for nl in node_lists:
            assert nl.ndim == 1 # 1D array
            assert nl.size >= 2 # At least two nodes
            assert is_sorted(nl)
        
        # Number of cutpoints along each dimension
        desc = [(nl[0],nl[-1],nl.size) for nl in node_lists]
        (low,hi,num) = zip(*desc)
        self.lower_bound = np.array(low)
        self.upper_bound = np.array(hi)
        self.num_nodes = np.array(num)
        
        self.num_cells = self.num_nodes - 1

        # Initialize the indexer
        self.indexer = Indexer(self.num_nodes)

        # Fuzz to convert [low,high) to [low,high]
        self.fuzz = 1e-12
    
    def points_to_cell_coords(self,points):
        (N,D) = points.shape
        assert D == self.dim

        Coords = np.empty((N,D))
        for d in xrange(D):
            # Find the correct position in the dth node list
            coord = np.searchsorted(self.node_lists[d],
                                    points[:,d],
                                    side='right') - 1
            # The 'right' is important if points are exactly on the node
            assert (N,) == coord.shape
            Coords[:,d] = coord

            # Include the upper boundary
            ub = self.upper_bound[d]
            hi_cell = self.num_cells[d] - 1
            fuzz_mask = np.logical_and(points[:,d] >= ub,
                                       points[:,d] < ub + self.fuzz)
            Coords[fuzz_mask,d] = hi_cell

            # Indexer will take care of mapping to correct OOB node
            #lb = self.lower_bound[d]
            #oob_mask = np.logical_or(points[:,d] < lb,
            #                         points[:,d] >= ub+self.fuzz)
            #Coords[oob_mask,d] = np.nan
        return Coords

    def points_to_indices(self,points):
        coords = self.points_to_cell_coords(points)
        return self.indexer.coords_to_indices(coords)

    def indices_to_lowest_points(self,indices):
        assert 1 == indices.ndim
        
        coords = self.indexer.indices_to_coords(indices)
        return self.coords_to_lowest_points(coords)
        
    def coords_to_lowest_points(self,coords):
        assert 2 == coords.ndim
        (N,D) = coords.shape
        assert self.dim == D

        points = np.empty((N,D))
        for d in xrange(D):
            points[:,d] = self.node_lists[d,coords[:,d]]

        return points
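
For the irregular grid, cell lookup is a per-dimension binary search: np.searchsorted(nodes, x, side='right') - 1 gives the index of the last node at or below x. A one-dimensional sketch with the same top-boundary fuzz handling as points_to_cell_coords above (the node positions are arbitrary):

import numpy as np

def irregular_cells_1d(x, nodes, fuzz=1e-12):
    """Map values to cells of a sorted, irregular node list, as in IrregularGrid."""
    x = np.asarray(x, dtype=np.double)
    nodes = np.asarray(nodes, dtype=np.double)
    cells = np.searchsorted(nodes, x, side='right') - 1
    # Points exactly on (or a hair above) the last node belong to the last cell.
    upper = nodes[-1]
    cells[np.logical_and(x >= upper, x < upper + fuzz)] = nodes.size - 2
    return cells

print(irregular_cells_1d([0.0, 0.3, 2.5, 10.0], nodes=[0.0, 0.5, 2.0, 10.0]))
# -> [0 0 2 2]
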
Example #59
-1
class Crawler(object):

    def __init__(self):
        self.visited_url = set()
        self.root_url = None
        self.indexer = Indexer()

    def pass_robot_txt(self,url):
        robot = robotparser.RobotFileParser()
        robot.set_url(self.root_url)
        robot.read()

        return robot.can_fetch('*',url)

    def define_root_url(self,url):
        self.root_url = url

    def add_included_suburls(self, soup):

        urls = set()

        refs = soup.findAll('a')


        for ref in refs:
            try:
                href = ref['href']
            except Exception:
                print("Doesn't contains suburl")
                continue

            if len(href) < 2:
                continue

            if '//' in href:
                continue

            if href[0] != '/':
                continue

            if self.root_url in href:
                urls.add(href)

            urls.add(self.root_url + href)

        return urls

    def get_pair_word_and_count(self, soup):

        def visible(element):
            if element.parent.name in ['head','script','style','[document]']:
                return False

            if re.match('<--.*-->',str(element)):
                return False

            if element == '\n':
                return False

            return True


        data = soup.findAll(text = True)

        visible_text = filter(visible, data)
        words = list()

        for text in visible_text:
            result = re.findall(r'[0-9a-z]+',text.lower())

            for res in result:
                words.append(res)

        self.indexer.add_words(set(words))

        return Counter(words)


    def visit(self, url, width, depth):

        if depth<0:
            return

        if not self.pass_robot_txt(url):
           raise Exception("robot.txt founded")

        current_url = url
        self.indexer.add_url(current_url)

        depth = depth - 1

        try:
            html = urllib2.urlopen(url).read()
        except Exception:
            print("Can't open this *** url")
            return

        soup = BeautifulSoup(html)

        urls = self.add_included_suburls(soup)

        for url in urls:
            if url in self.visited_url:
                continue

            if width == 0:
                break

            self.visited_url.add(url)
            width = width -1
            self.visit(url,width,depth)

        words = self.get_pair_word_and_count(soup).iteritems()

        self.indexer.create_index(words, current_url)

    def run(self,url,width,depth):
        self.define_root_url(url)
        self.visit(url,width,depth)
Example #60
-1
 def __init__(self):
     self.visited_url = set()
     self.root_url = None
     self.indexer = Indexer()