Example No. 1
def vectorial_search(querystring, collection_index, weight_type):
    """
    Vector-space search for `querystring` in `collection_index` using weights of type
    `weight_type`. Returns the results above the similarity threshold, ordered by similarity.
    """
    search_results = []  # search results

    # Index the query and build its vector
    query_doc = QueryDocument(querystring)
    query_index = Index([query_doc])
    # Compute the query vector with respect to the collection index
    query_vector = query_index.get_document_vector(query_doc.id, weight_type, collection_index)

    # Compute the similarity between the query and each document of the collection
    for doc_id in collection_index.documents_ids:
        doc_vector = collection_index.get_document_vector(doc_id, weight_type)
        similarity = cosinus_similarity(query_vector, doc_vector)
        search_result = SearchResult(doc_id, similarity)
        search_results.append(search_result)

    # Sort the results by decreasing similarity
    search_results = sorted(search_results, key=lambda result: -result.similarity)

    # Return only the results with a similarity of at least 15%.
    # I tried several similarity thresholds and 15% seems to be the one
    # that filters the results best (for the reference queries of the dataset).
    return [result for result in search_results if result.similarity > 0.15]
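`cosinus_similarity` is not shown in this snippet. A minimal sketch, assuming the vectors returned by `get_document_vector` are plain {term: weight} dicts (an assumption; the real representation is not visible here):

import math

def cosinus_similarity(vec_a, vec_b):
    # Hypothetical helper: cosine of the angle between two sparse vectors
    # represented as {term: weight} dicts.
    dot = sum(w * vec_b.get(t, 0.0) for t, w in vec_a.items())
    norm_a = math.sqrt(sum(w * w for w in vec_a.values()))
    norm_b = math.sqrt(sum(w * w for w in vec_b.values()))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)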
Example No. 2
	def parseCreateTableString(self, createTableString):
		createTablePattern = re.compile(r'CREATE TABLE `(?P<name>[a-z_]+)` \((?P<columns>.*?)\) ENGINE=(?P<engine>[a-z]+) (AUTO_INCREMENT=(?P<autoincrement>\d+) )?DEFAULT CHARSET=(?P<charset>[a-z\d]+)',
				re.IGNORECASE | re.DOTALL)
		matches = createTablePattern.match(createTableString)

		if matches is None:
			# fail fast: the code below would otherwise crash on matches.group()
			raise ValueError("Unparseable CREATE TABLE statement:\n" + createTableString)

		columns = matches.group('columns').strip().split("\n")
		for index, column in enumerate(columns):
			column = column.strip()
			column = column.strip(',')

			primaryKeyMatch = re.match("^(PRIMARY KEY \((?P<columns>.*)\))", column)
			uniqueKeyMatch = re.match("^(UNIQUE KEY `(?P<key_name>.*?)` \((?P<columns>.*)\))", column)
			keyMatch = re.match("^(KEY `(?P<key_name>.*?)` \((?P<columns>.*)\))", column)

			if primaryKeyMatch is not None:
				indexColumns = self.columnStringsToObjects(Index.parseColumnNamesFromString(primaryKeyMatch.group('columns')))
				self.indexes.append(Index('PRIMARY', 'PRIMARY', indexColumns))
			elif uniqueKeyMatch is not None:
				indexColumns = self.columnStringsToObjects(Index.parseColumnNamesFromString(uniqueKeyMatch.group('columns')))
				self.indexes.append(Index('UNIQUE', uniqueKeyMatch.group('key_name'), indexColumns))
			elif keyMatch is not None:
				indexColumns = self.columnStringsToObjects(Index.parseColumnNamesFromString(keyMatch.group('columns')))
				self.indexes.append(Index('KEY', keyMatch.group('key_name'), indexColumns))
			else:
				self.columns.append(Column.fromString(column))

		self.name = matches.group('name')
		self.autoincrement = matches.group('autoincrement')
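As a usage sketch, the pattern above matches dump-style DDL such as the following (a hypothetical statement; `table` stands for whatever object carries this method):

ddl = ("CREATE TABLE `users` (\n"
       "`id` int(11) NOT NULL AUTO_INCREMENT,\n"
       "`email` varchar(255) NOT NULL,\n"
       "PRIMARY KEY (`id`),\n"
       "UNIQUE KEY `email_unique` (`email`)\n"
       ") ENGINE=InnoDB AUTO_INCREMENT=42 DEFAULT CHARSET=utf8")
table.parseCreateTableString(ddl)  # fills table.name, table.columns and table.indexes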
Example No. 3
 def __init__(self, data=None, index=None, name=None, series=None):
     '''
     One-dimensional array with axis labels (including time series).
     
     :param data: (*array_like*) One-dimensional array data.
     :param index: (*list*) Data index list. Values must be unique and hashable, same length as data.
     :param name: (*string*) Series name.
     :param series: (*MISeries*) Optional existing backing series; when given, data and index are taken from it.
     '''
     if series is None:
         if isinstance(data, (list, tuple)):
             data = minum.array(data)
         if index is None:
             index = range(0, len(data))
         else:
             if len(data) != len(index):
                 raise ValueError('Wrong length of index!')
         if isinstance(index, (MIArray, DimArray)):
             index = index.tolist()
         if isinstance(index, Index):
             self._index = index
         else:
             self._index = Index.factory(index)
         self._data = data
         self._series = MISeries(data.array, self._index._index, name)
     else:
         self._series = series
         self._data = MIArray(self._series.getData())
         self._index = Index.factory(index=self._series.getIndex())
Example No. 4
def build_index(url, depth):
    index = Index()
    crawler = CustomCrawler(url, depth, index)
    crawler.crawl_all_links()
    index.status()
    print("Pages visited: %d" % len(crawler.visited))
    return index
Example No. 5
def get_gradient(im, index, border_thickness_steps):
    """
    Computes the radial gradient, accounting for the thickness of the cell edges.
    @param im: image for which the gradient will be computed
    @param index: indices of pixels sorted by polar coordinates (alpha, radius)
    @param border_thickness_steps: number of steps to compute the gradient over - depends on cell border thickness
    @return: gradient matrix for the cell
    """
    # index of the axis along which the maximal gradient is taken
    max_gradient_along_axis = 2
    # dimensions of the image slice (the "subimage") for which the gradient will be computed
    radius_lengths, angles = index.shape[0], index.shape[1]
    # matrix init: a separate gradient is computed for each border-thickness step,
    # and at the end the maximal gradient values over all steps are returned
    gradients_for_steps = np.zeros((radius_lengths, angles, int(border_thickness_steps)), dtype=np.float64)
    # for each step implied by the cell border thickness
    # (the smallest step has size 1, the largest has size ${border_thickness_steps}):
    for border_thickness_step in range(1, int(border_thickness_steps) + 1):

        # find the start and end indices of the input matrix for which the gradient will be computed
        matrix_end = radius_lengths - border_thickness_step
        matrix_start = border_thickness_step

        # find the start and end slices of the pixel index for which the gradient will be computed
        starting_index = index[:matrix_end, :]
        ending_index = index[matrix_start:, :]

        # find the slice of the result matrix that the computed gradient will be written to
        intersect_start = int(math.ceil(border_thickness_step / 2.0))
        intersect_end = int(intersect_start + matrix_end)

        # compute the current gradient for the selected (sub)image
        try:
            current_step_gradient = im[Index.to_numpy(ending_index)] - im[Index.to_numpy(starting_index)]
        except Exception:
            print(border_thickness_step)
            print(radius_lengths)
            print(matrix_start)
            print(matrix_end)
            print(ending_index)
            print(starting_index)
            raise

        current_step_gradient /= np.sqrt(border_thickness_step)
        # write the gradient into the previously determined slice of the results matrix
        gradients_for_steps[intersect_start:intersect_end, :, border_thickness_step-1] = current_step_gradient

    return gradients_for_steps.max(axis=max_gradient_along_axis)
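The core idea (difference the image at increasing radial offsets, normalize by the square root of the step, keep the per-position maximum) can be seen on a plain 1-D profile. A self-contained analogue, without the project's `Index` machinery:

import numpy as np

profile = np.array([0., 0., 1., 5., 9., 10., 10.])
steps = 3
grads = np.zeros((len(profile), steps))
for s in range(1, steps + 1):
    diff = (profile[s:] - profile[:-s]) / np.sqrt(s)  # offset-s difference
    start = int(np.ceil(s / 2.0))
    grads[start:start + len(diff), s - 1] = diff      # centred placement
print(grads.max(axis=1))  # strongest edge response per position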
Example No. 6
 def test1(self):
     con = make_dbcon()
     ds = DataStore(con)
     col = ds.collection("users")
     
     i = Index(con, col, 'email')
     
     self.assertEqual(i.name(), 'email')
Example No. 7
 def init(self, *file):
     self.rules      = [] 
     self.if_index   = Index().init() 
     self.then_index = Index().init()
     
     if file:
         self.load_rules(file[0])
     return self
Example No. 8
def initialize(facts, kbase):
    known = Index().init()
    for fact in facts:
        known.store(fact, (fact, 'initial'))          # fact, proof
    known.store(['true'], (['true'], 'atomic'))       # if true then...

    for rule in kbase.rules:
        rule['trigger'] = 0 
    return known
Example No. 9
 def __init__(self, config, lang=None):
     """
     Constructor method
     """
     
     self.config = config
     self.lang = lang
     self.subscribers = Subscribers(config)
     self.index = Index(self.config)
Example No. 10
def setup(html_file=None, corpus_file=CORPUS_CSV, index_file=INDEX):
    # Preprocessing the html
    createCSV(src_file=html_file)

    # Create index
    idxf2 = Index(corpus=corpus_file)
    if os.path.exists(index_file):
        print("Detected %s exists, removing ..." % index_file)
        os.remove(index_file)
    idxf2.save(index_file)
Example No. 11
 def test_passing_stopwords_should_remove_these_words_from_token_list(self):
     index = Index(stopwords=['yes', 'no', ',', '.', '!'])
     index.add_document('coffee', 'Yes, sir! No, Joyce.')
     self.assertEquals(
         index._index,
         {
             'sir': set(['coffee']),
             'joyce': set(['coffee'])
         },
     )
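Together with the lowercase and pickle tests further down (Examples No. 17, 31 and 38), this pins the Index behaviour down well enough to reconstruct. A minimal sketch under those assumptions, not the project's actual class:

import pickle
import re
from collections import defaultdict

class Index(object):
    # Hypothetical minimal inverted index consistent with these tests.

    def __init__(self, stopwords=None):
        self.stopwords = set(w.lower() for w in (stopwords or []))
        self._index = defaultdict(set)  # token -> set of document names

    def add_document(self, name, text):
        # lowercase, split on non-word characters, drop stopwords
        for token in re.findall(r'\w+', text.lower()):
            if token not in self.stopwords:
                self._index[token].add(name)

    def tokens(self):
        return set(self._index)

    def __len__(self):
        # number of distinct documents indexed
        return len(set().union(*self._index.values())) if self._index else 0

    def dump(self, filename):
        with open(filename, 'wb') as fp:
            pickle.dump(self, fp)

    @classmethod
    def load(cls, filename):
        with open(filename, 'rb') as fp:
            return pickle.load(fp)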
Example No. 12
    def setUp(self):
        root_path = os.path.dirname(os.path.realpath(__file__)) + "/../"
        try:
            os.unlink(root_path + "data/{{cookiecutter.project_slug}}.json")
            os.unlink(root_path +
                      "data/{{cookiecutter.project_slug}}.json.backup")
        except Exception:
            pass

        self.index = Index(connection=Mockup())
Example No. 13
    def test_asdf(self):
        index = Index()
        index[PurePath('a')] = '3'
        self.assertEqual(1, len(index))
        self.assertIn(PurePath('a'), index)
        self.assertEqual('3', index[PurePath('a')])

        del index[PurePath('a')]
        self.assertEqual(0, len(index))
        self.assertFalse(index.keys())
Example No. 14
def store(k, terms, shlf_path):
    main_shlv = shelve.open(shlf_path, 'c')
    for w in terms:
        if w in main_shlv:
            index = main_shlv[w]
        else:
            index = Index()
        index.add_doc(int(k))
        main_shlv[w] = index
    main_shlv.close()
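Note the read-modify-write pattern: `shelve` only persists a value when the key is reassigned, which is why `main_shlv[w] = index` runs even when the entry already existed. A hypothetical round trip (assuming `Index.add_doc` records document ids):

store('1', ['cat', 'dog'], 'terms.shelf')
store('2', ['dog'], 'terms.shelf')

main_shlv = shelve.open('terms.shelf', 'r')
dog_index = main_shlv['dog']  # an Index that has seen docs 1 and 2
main_shlv.close()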
Example No. 15
class Repository:
    def __init__(self):
        self.trees = {}
        self.commits = {}
        self.index = Index()
        self.branches = {}
        self.head = None

    def pack(self):
        return {
            'index': self.index.pack(),
            'trees': self.trees,
            'commits': self.commits,
            'branches': self.branches,
            'head': self.head,
        }

    def dump(self, filename):
        dump_object(filename, self.pack())

    def save(self, filename):
        save_object(filename, self.pack())

    def __str__(self):
        return json.dumps(self.pack(), ensure_ascii=False)

    def branch(self, name, ref):
        self.branches[name] = ref

    def write_tree(self):
        (tree_id, tree) = self.index.write_tree()
        self.trees = dict(self.trees, **tree)
        return tree_id

    def write_commit(self, message, author, tree_id, previous=None):
        # previous=None avoids a shared mutable default argument
        commit = {
            'tree_id': tree_id,
            'previous': previous if previous is not None else [],
            'message': message,
            'date': str(datetime.now()),
            'author': author
        }
        commit_id = sha1(commit)
        self.commits[commit_id] = commit
        return commit_id

    def diff_branches(self, visitor, branch_a, branch_b):
        return self.diff_commits(visitor, self.branches[branch_a],
                                 self.branches[branch_b])

    def diff_commits(self, visitor, ref_commit_a, ref_commit_b):
        commit_a = self.commits[ref_commit_a]
        commit_b = self.commits[ref_commit_b]
        return diff_trees(visitor, commit_a['tree_id'], commit_b['tree_id'],
                          self.trees)
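A hypothetical end-to-end use of the class, assuming `sha1`, `save_object` and the `Index` API behave as the method bodies suggest:

repo = Repository()
# ... stage files into repo.index (the Index API is not shown here) ...
tree_id = repo.write_tree()                                  # snapshot the index as a tree
commit_id = repo.write_commit('initial commit', 'alice', tree_id)
repo.branch('master', commit_id)                             # point a branch at the commit
repo.head = 'master'
repo.save('repo.bin')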
Example No. 16
class Repository(object):
    '''
    The git repository
    '''

    GIT_DIR = '.git'

    INIT_DIR = [
        'branches',
        'hooks',
        'info',
        'objects',
        'objects/info',
        'objects/pack',
        'refs',
        'refs/heads',
        'refs/tags',
    ]

    INIT_FILE = [
        ['HEAD', 'ref: refs/heads/master'],
        ['description', 'Unnamed repository'],
        ['info/exclude', ''],
    ]

    def __init__(self, workspace):
        self.workspace = workspace
        self.index = Index(os.path.join(workspace, '.git', 'index'))
        self.config = Config(workspace)
        self.head_path = self._get_head_path()
        self.head_tree = None
        if os.path.exists(self.head_path):
            self.head_tree = read_file(self.head_path).strip()

    def _get_head_path(self):
        branch_name = read_file(os.path.join(self.workspace, '.git',
                                             'HEAD')).strip('\n').rsplit(
                                                 '/', 1)[-1]
        return os.path.join(self.workspace, '.git', 'refs', 'heads',
                            branch_name)

    def stage(self, files):
        try:
            for file in files:
                content = read_file(file)
                blob = Blob(self.workspace, content)
                if not os.path.exists(blob.path):
                    write_object_to_file(blob.path, blob.content)
                stat = os.stat(os.path.join(self.workspace, file))
                self.index.add_entry(file, ctime=stat.st_ctime, mtime=stat.st_mtime,
                                     dev=stat.st_dev, ino=stat.st_ino, mode=cal_mode(stat.st_mode),
                                     uid=stat.st_uid, gid=stat.st_gid, size=stat.st_size,
                                     sha1=blob.sha1, flags=0)
            self.index.write_to_file()

        except Exception as e:
            print('stage file %s error: %s' % (file, e))
Example No. 17
 def test_should_store_tokens_lowercase(self):
     index = Index()
     index.add_document('doc', 'This IS mY firsT DoCuMeNt')
     expected_tokens = set(['this', 'is', 'my', 'first', 'document'])
     expected_index = {'this': set(['doc']),
                       'is': set(['doc']),
                       'my': set(['doc']),
                       'first': set(['doc']),
                       'document': set(['doc']),}
     self.assertEquals(index.tokens(), expected_tokens)
     self.assertEquals(dict(index._index), expected_index)
Example No. 18
    def test_pending_task_could_be_snoozed(self):
        today = datetime.datetime.today()
        today_10_min_future = today + datetime.timedelta(minutes=10)

        self.addTask(6, "task starting in 10 min", "PENDING", today_10_min_future.strftime('%Y-%m-%d %H:%M'), "NONE")
        overdue, startingsoon = self.index.listNotificationsPendingTasks()
        self.assertEqual(len(startingsoon), 1)
        self.index.snooze(6)
        index = Index()
        overdue, startingsoon = index.listNotificationsPendingTasks()
        self.assertEqual(len(startingsoon), 0)
Example No. 19
def load_index(index_path, train_path, reconstruct=Setting.RERUN):
    print("load or construct index...")
    if not reconstruct and os.path.exists(index_path):
        index = load_data(index_path)
    else:
        index = Index()
        index.train_path = train_path
        index.construct_index()
        dump_data(index, index_path)
    print("done!")
    return index
Example No. 20
 def test_mark_task_done(self):
     now = datetime.datetime.now()
     newTask = self.index.addTask("new task", now, "NONE")
     taskWithDate, taskWithoutDates = self.index.listAll()
     self.assertEqual(len(taskWithDate), 3)
     self.assertEqual(len(taskWithoutDates), 1)
     updatedTask = self.index.markTaskComplete(newTask.id)
     self.assertEqual(updatedTask.status, "DONE")
     index = Index()
     deletedTask = index.findTaskById(updatedTask.id)
     self.assertIsNone(deletedTask)
Example No. 21
 def __init__(self):
     self.database = "hista"
     self.table = "bucket"
     self.primary_key = "id"
     self.meta = DBMeta()
     self.index = Index(self.database, self.table, self.primary_key)
     self.db_file = "hista.db"
     r = self._is_table_exists()
     if not r:
         print("table does not exist, create it now.")
         self.create_table()
Example No. 22
 def test_mark_task_done(self):
     now = datetime.datetime.now()
     newTask = self.index.addTask("new task", now, "NONE")
     taskWithDate, taskWithoutDates = self.index.listAll()
     self.assertEqual(len(taskWithDate), 3)
     self.assertEqual(len(taskWithoutDates), 1)
     updatedTask = self.index.markTaskComplete(newTask.id)
     self.assertEqual(updatedTask.status, "DONE")
     index = Index()
     deletedTask = index.findTaskById(updatedTask.id)
     self.assertIsNone(deletedTask)
Example No. 23
    def test_pending_task_could_be_snoozed(self):
        today = datetime.datetime.today()
        today_10_min_future = today + datetime.timedelta(minutes=10)

        self.addTask(6, "task starting in 10 min", "PENDING",
                     today_10_min_future.strftime('%Y-%m-%d %H:%M'), "NONE")
        overdue, startingsoon = self.index.listNotificationsPendingTasks()
        self.assertEqual(len(startingsoon), 1)
        self.index.snooze(6)
        index = Index()
        overdue, startingsoon = index.listNotificationsPendingTasks()
        self.assertEqual(len(startingsoon), 0)
Example No. 24
class Repository(object):
    '''
    The git repository
    '''

    GIT_DIR = '.git'
    
    INIT_DIR = [
        'branches',
        'hooks',
        'info',
        'objects',
        'objects/info',
        'objects/pack',
        'refs',
        'refs/heads',
        'refs/tags',
    ]
    
    INIT_FILE = [
        ['HEAD', 'ref: refs/heads/master'],
        ['description', 'Unnamed repository'],
        ['info/exclude', ''],
    ]
    
    
    def __init__(self, workspace):
        self.workspace = workspace
        self.index = Index(os.path.join(workspace, '.git', 'index'))
        self.config = Config(workspace)
        self.head_path = self._get_head_path()
        self.head_tree = None
        if os.path.exists(self.head_path):
            self.head_tree = read_file(self.head_path).strip()
    
    def _get_head_path(self):
        branch_name = read_file(os.path.join(self.workspace, '.git', 'HEAD')).strip('\n').rsplit('/', 1)[-1]
        return os.path.join(self.workspace, '.git', 'refs', 'heads', branch_name)
        
    def stage(self, files):
        try:
            for file in files:
                content = read_file(file)
                blob = Blob(self.workspace, content)
                if not os.path.exists(blob.path):
                    write_object_to_file(blob.path, blob.content)
                stat = os.stat(os.path.join(self.workspace, file))
                self.index.add_entry(file, ctime=stat.st_ctime, mtime=stat.st_mtime,
                                     dev=stat.st_dev, ino=stat.st_ino, mode=cal_mode(stat.st_mode),
                                     uid=stat.st_uid, gid=stat.st_gid, size=stat.st_size,
                                     sha1=blob.sha1, flags=0)
            self.index.write_to_file()
                    
        except Exception as e:
            print('stage file %s error: %s' % (file, e))
Example No. 25
    def __init__(self, data=None, index=None, columns=None, dataframe=None):
        if dataframe is None:
            if data is not None:
                if isinstance(data, dict):
                    columns = data.keys()
                    dlist = []
                    n = 1
                    for v in data.values():
                        if isinstance(v, (list, tuple)):
                            n = len(v)
                            v = np.array(v)
                        elif isinstance(v, np.NDArray):
                            n = len(v)
                        dlist.append(v)
                    for i in range(len(dlist)):
                        d = dlist[i]
                        if not isinstance(d, np.NDArray):
                            d = [d] * n
                            d = np.array(d)
                            dlist[i] = d
                    data = dlist

                if isinstance(data, np.NDArray):
                    n = len(data)
                    data = data._array
                else:
                    dlist = []
                    n = len(data[0])
                    for dd in data:
                        dlist.append(dd._array)
                    data = dlist

                if index is None:
                    index = range(0, n)
                else:
                    if n != len(index):
                        raise ValueError('Wrong length of index!')

            if isinstance(index, np.NDArray):
                index = index.tolist()

            if isinstance(index, Index):
                self._index = index
            else:
                self._index = Index.factory(index)
            if data is None:
                self._dataframe = MIDataFrame(self._index._index)
            else:
                self._dataframe = MIDataFrame(data, self._index._index,
                                              columns)
        else:
            self._dataframe = dataframe
            self._index = Index.factory(index=self._dataframe.getIndex())
Example No. 26
    def __init__(self, config, lang=None):
        """
        Constructor method
        
        @param config: configuration
        @param lang: language
        """

        self.config = config
        self.lang = lang
        self.subscribers = Subscribers(config)
        self.index = Index(self.config)
Example No. 27
def call_index():
	global par_orb, par_color
	path = z.get()
	if path == '':
		tkMessageBox.showinfo('ERROR', 'Please provide a folder path!')
	else:
		if par_orb == 1:
			di = Data_index(path)
			di.insert_data()
		elif par_color == 1:
			i = Index(path)
			i.main_fun()
Example No. 28
 def __init__(self, data=None, index=None, columns=None, dataframe=None):
     if dataframe is None:
         if data is not None:
             if isinstance(data, dict):
                 columns = data.keys()
                 dlist = []
                 n = 1
                 for v in data.values():
                     if isinstance(v, (list, tuple)):
                         n = len(v)
                         v = minum.array(v)                    
                     elif isinstance(v, MIArray):
                         n = len(v)
                     dlist.append(v)
                 for i in range(len(dlist)):
                     d = dlist[i]
                     if not isinstance(d, MIArray):
                         d = [d] * n
                         d = minum.array(d)
                         dlist[i] = d
                 data = dlist
                 
             if isinstance(data, MIArray):
                 n = len(data)
                 data = data.array
             else:
                 dlist = []
                 n = len(data[0])
                 for dd in data:
                     dlist.append(dd.array)
                 data = dlist
                     
             if index is None:
                 index = range(0, n)
             else:
                 if n != len(index):
                     raise ValueError('Wrong length of index!')
                     
         if isinstance(index, (MIArray, DimArray)):
             index = index.tolist()
             
         if isinstance(index, Index):
             self._index = index
         else:
             self._index = Index.factory(index)
         if data is None:
             self._dataframe = MIDataFrame(self._index._index)
         else:
             self._dataframe = MIDataFrame(data, self._index._index, columns)
     else:
         self._dataframe = dataframe
         self._index = Index.factory(index=self._dataframe.getIndex())
Example No. 29
def main(filepath, column):
  indexer = Index(NGRAM)
  f = codecs.open(filepath, "r", "utf-8")
  lines = f.readlines()

  for line in lines:
    print(line)
    elems = line.split("\t")
    indexer.append(''.join(elems[column-1]))

  f.close()
  indexer.dump("data/")
  return
Example No. 30
    def test_index_in_memory(self):
        index = Index()
        self.assertEqual(0, len(index))
        self.assertFalse(index.keys())

        index[PurePath('a')] = '1'
        self.assertEqual(1, len(index))
        self.assertIn(PurePath('a'), index)
        self.assertEqual('1', index[PurePath('a')])

        del index[PurePath('a')]
        self.assertEqual(0, len(index))
        self.assertFalse(index.keys())
Example No. 31
 def test_should_store_tokens_lowercase(self):
     index = Index()
     index.add_document('doc', 'This IS mY firsT DoCuMeNt')
     expected_tokens = set(['this', 'is', 'my', 'first', 'document'])
     expected_index = {
         'this': set(['doc']),
         'is': set(['doc']),
         'my': set(['doc']),
         'first': set(['doc']),
         'document': set(['doc']),
     }
     self.assertEquals(index.tokens(), expected_tokens)
     self.assertEquals(dict(index._index), expected_index)
Example No. 32
class Repository(object):
    """
    The git repository
    """

    GIT_DIR = ".git"

    INIT_DIR = [
        "branches",
        "hooks",
        "info",
        "objects",
        "objects/info",
        "objects/pack",
        "refs",
        "refs/heads",
        "refs/tags",
    ]

    INIT_FILE = [["HEAD", "ref: refs/heads/master"], ["description", "Unnamed repository"], ["info/exclude", ""]]

    def __init__(self, workspace):
        self.workspace = workspace
        self.index = Index(os.path.join(workspace, ".git", "index"))
        self.config = Config(workspace)

    def stage(self, files):
        try:
            for file in files:
                content = read_file(file)
                blob = Blob(self.workspace, content)
                if not os.path.exists(blob.path):
                    write_object_to_file(blob.path, blob.content)
                stat = os.stat(os.path.join(self.workspace, file))
                self.index.add_entry(
                    file,
                    ctime=stat.st_ctime,
                    mtime=stat.st_mtime,
                    dev=stat.st_dev,
                    ino=stat.st_ino,
                    mode=cal_mode(stat.st_mode),
                    uid=stat.st_uid,
                    gid=stat.st_gid,
                    size=stat.st_size,
                    sha1=blob.sha1,
                    flags=0,
                )
            self.index.write_to_file()

        except Exception as e:
            print("stage file %s error: %s" % (file, e))
Example No. 33
    def download_and_register_illustrations(self, illustrations):
        image_registration_arguments = []
        for illustration in illustrations:
            image_url = illustration.meta_single_page.get('original_image_url', illustration.image_urls.large)
            tags = [tag['name'] for tag in illustration.tags]

            url_basename = os.path.basename(image_url)
            extension = os.path.splitext(url_basename)[1]
            name = 'pixiv_{}{}'.format(str(illustration.id), extension)

            self.app_api.download(url=image_url, name=name)
            image_registration_arguments.append((name, tags))

        Index.get_or_create_instance().register_new_illustration_list(image_registration_arguments)
Example No. 34
    def test_multipart(self):
        index = Index()
        index[PurePath('a/b')] = '2'
        self.assertEqual(1, len(index))
        self.assertIn(PurePath('a/b'), index)
        self.assertEqual([PurePath('a/b')], list(index))
        self.assertEqual('2', index[PurePath('a/b')])
        self.assertEqual(1, len(index[PurePath('a')]))
        self.assertIn(PurePath('b'), index[PurePath('a')])
        self.assertEqual('2', index[PurePath('a')][PurePath('b')])

        del index[PurePath('a/b')]
        self.assertEqual(0, len(index))
        self.assertFalse(index.keys())
Example No. 35
def main() -> None:
    index = Index(BIGBOOK_DIR, SHOW_INDEX_FILE)

    templates = TemplateLookup(
        [TEMPLATE_DIR / 'website', TEMPLATE_DIR / 'website' / 'Jinja'],
        strict_undefined=True)

    # Create website directories, if necessary.
    for dirname in ('Text', 'Images', 'Media', 'Fonts', 'Styles'):
        (WEBSITE_DIR / dirname).mkdir(exist_ok=True, parents=True)

    # Copy in the styling files from the web template.
    for dirname in ('Fonts', 'Styles', 'Images'):
        for file in (TEMPLATE_DIR / 'common' / dirname).glob('*'):
            copyfile(src=file, dst=WEBSITE_DIR / dirname / file.name)
        for file in (TEMPLATE_DIR / 'website' / dirname).glob('*'):
            copyfile(src=file, dst=WEBSITE_DIR / dirname / file.name)

    # Copy in the Big Book's media files, if necessary.
    for dirname in ('Images', 'Media'):
        for file in (BIGBOOK_DIR / dirname).glob('*'):
            dst = WEBSITE_DIR / dirname / file.name
            if not dst.exists():
                copyfile(src=file, dst=dst)

    # Expand the 'index.html' file template.
    file = TEMPLATE_DIR / 'website' / 'index.html'
    template = templates.get_template(file.name)
    html_file = WEBSITE_DIR / file.name
    html_file.write_text(template.render(index=index))

    # Expand the index pages' templates.
    for file in (TEMPLATE_DIR / 'website' / 'Text').glob('index-*.html'):
        template = templates.get_template(file.name)
        html_file = WEBSITE_DIR / 'Text' / file.name
        html_file.write_text(template.render(index=index))

    # Expand the pages for the Big Book, using the page.html template.
    for article in index.articles():
        # content = xhtml.content(article.file, heading=True)
        # I'm going to be a barbarian instead and use a regex on HTML.
        # It's acceptably accurate on these Big Book files, and much faster.
        html = article.file.read_text()
        content = re.search(r'<body[^>]*>(.+)</body>', html, re.DOTALL)[1]
        content = content.replace('.xhtml"', '.html"')
        template = templates.get_template('page.html')
        destination = WEBSITE_DIR / 'Text' / (article.id + '.html')
        destination.write_text(
            template.render(content=content, article=article))
Example No. 36
    def test_subindex(self):
        index = Index()
        subindex = Index()
        subindex[PurePath('c')] = '4'
        index[PurePath('d')] = subindex
        self.assertEqual(1, len(index))
        self.assertIn(PurePath('d/c'), index)
        self.assertEqual('4', index[PurePath('d/c')])
        self.assertEqual(1, len(index[PurePath('d')]))
        self.assertIn(PurePath('c'), index[PurePath('d')])
        self.assertEqual('4', index[PurePath('d')][PurePath('c')])

        del index[PurePath('d')]
        self.assertEqual(0, len(index))
        self.assertFalse(index.keys())
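The four PurePath tests (Examples No. 13, 30, 34 and this one) specify a path-keyed nested mapping: multi-part keys recurse into subindexes, iteration yields full leaf paths, and deleting a leaf prunes empty subindexes. A minimal sketch under those assumptions, not the project's actual class:

from collections.abc import MutableMapping
from pathlib import PurePath

class Index(MutableMapping):
    # Hypothetical nested mapping keyed by PurePath, reconstructed from the tests.

    def __init__(self):
        self._children = {}  # first path part -> leaf value or sub-Index

    def _split(self, path):
        parts = PurePath(path).parts
        return parts[0], PurePath(*parts[1:]) if len(parts) > 1 else None

    def __setitem__(self, path, value):
        head, rest = self._split(path)
        if rest is None:
            self._children[head] = value
        else:
            self._children.setdefault(head, Index())[rest] = value

    def __getitem__(self, path):
        head, rest = self._split(path)
        node = self._children[head]
        return node if rest is None else node[rest]

    def __delitem__(self, path):
        head, rest = self._split(path)
        if rest is None:
            del self._children[head]
        else:
            del self._children[head][rest]
            if not self._children[head]._children:
                del self._children[head]  # prune empty subindexes

    def __iter__(self):
        # yield full paths down to the leaves
        for head, node in self._children.items():
            if isinstance(node, Index):
                for sub in node:
                    yield PurePath(head) / sub
            else:
                yield PurePath(head)

    def __len__(self):
        return sum(1 for _ in self)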
Example No. 37
	def __init__(self,collections):

		if not os.path.exists('saved_library'):  # will change to the name of the .avc file
			print("making new save data")
			self.library = Library()
			# for now
			self.library.init_from_xml("test.avc")

		else:
			self.load_library('saved_library',True)

		#self.savedata = open('saved_library','wb')	
		self.index = Index(self.library.get_clip_names())
		self.tag_index = Index(self.library.get_tags())
		self.col_index = Index(collections)
Example No. 38
 def test_calling_method_load_should_retrieve_object_from_pickle_file(self):
     fp = NamedTemporaryFile(delete=False)
     fp.close()
     self.filename = fp.name
     index = Index()
     index.add_document('coffee', 'I liked it')
     index.add_document('water', 'I need it')
     index.dump(self.filename)
     retrieved_index = Index.load(self.filename)
     self.assertEquals(len(retrieved_index), 2)
     self.assertEquals(set(retrieved_index._index.keys()),
                       set(['i', 'liked', 'need', 'it']))
Example No. 39
def build_index():
    """
    Builds an index from a given folder.

    Normalizes the documents, tokenizes them, and creates the index.

    This function is called only when the user has provided a wrong
    index file, or did not provide one at all.
    """

    processors.append(NormalizerProcessor())

    # Fetch every document from the input folder
    print('[FETCHING]\tReading text files from \'{0}\'...'.format(folder))
    documents = Document.fetch(folder, True)

    # Normalize every loaded document
    print('[PROCESSING]\tNormalizing words from every document...')
    tokenize_all(documents)

    # Create the index by mapping every word
    # to all the documents that reference it
    print('[INDEXING]\tBuilding index from words...\n')
    posting_list = Posting.index(tokenized_documents)
    index = Index.build(posting_list)

    return index
Example No. 40
    def test_put_schema__time_empty_should_fail(self):
        # arrange
        SUT = {"time": ""}

        # act
        with self.assertRaises(er.MultipleInvalid):
            Index.PUT_SCHEMA(SUT)
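`er.MultipleInvalid` is the error type the voluptuous library raises, which suggests `Index.PUT_SCHEMA` is a voluptuous schema. A minimal sketch that would make this test pass (an assumption, not the project's real schema):

from voluptuous import All, Length, Schema

PUT_SCHEMA = Schema({'time': All(str, Length(min=1))})  # an empty 'time' fails validation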
Example No. 41
	def plot1(self, dat):
		with Index(dat,['canton']) as df_:
			df = df_.loc[self.abbreviation]
			df = df.reset_index()
			dates = df['date'].drop_duplicates()
			#dates_complete = complete_index(dates)
			dates_complete = dates
			df = df.set_index('date')
			released = df.loc[:,'ncumul_released'].values
			released[np.isnan(released)] = 0
			deceased = df.loc[:,'ncumul_deceased'].values
			deceased[np.isnan(deceased)] = 0
			df.loc[:,'current_hosp'] = df.loc[:,'current_hosp'].values + released  + deceased
			df.loc[:,'current_vent'] = df.loc[:,'current_vent'].values + released + deceased
			df.loc[:,'current_icu'] = df.loc[:,'current_icu'].values + deceased
			df = (df.rolling('7D').mean()).diff()
			#df = df.mean().diff()
			fig = plt.figure()
			ax = fig.subplots()
			ax.plot(df['ncumul_conf'],label='Neue Positive Testergebnisse')
			ax.plot(df['current_hosp'],label='Neue Hospitalisierungen')
			#ax.plot(df['current_icu'],label='Intensivstation')
			#ax.plot(df['ncumul_deceased'],label='Beatmet')
			ax.plot(df['ncumul_deceased'],label='Verstorben')
			plt.yscale('log')
			plt.grid(which='both')
			plt.legend()
			ax.set_title(self.canton + ' (Durchschnittswerte der letzten 7 Tage)')
			ax.set_ylabel('Durchschnittswert pro Tag')
			fig.autofmt_xdate()
		return fig
Example No. 42
def main():
    pp = pprint.PrettyPrinter()

    data_dir = 'data/easyCLEF08'
    txt_file = os.path.join(data_dir, 'easyCLEF08_text.txt')
    query_file = os.path.join(data_dir, 'easyCLEF08_query.txt')
    relevants_file = os.path.join(data_dir, 'easyCLEF08_gt.txt')

    index = Index(directory=data_dir, txt_file=txt_file, create_index=False)

    okapi = Okapi(index, k1=1.80, b=0.65)

    diversity = DiversityClustering(DBSCANClustering, index)

    q = QueryParser(relevants_file)
    q.q.initFile(query_file)

    # while True:
    for i in range(1):
        query = q.nextQuery()
        if query is None:
            break
        if len(query.relevants) == 0:
            print('No relevants docs')
            continue

        docs_scores = okapi.getRanking(query.getText())
        ordered_pred = diversity.order_pred(query, docs_scores)
Example No. 43
    def test_put_schema_with_empty_pin_code_should_pass(self):
        # arrange
        SUT = {
            "id": 1,
            "enable": True,
            "pdpContext": {
                "static": True,
                "id": 1,
                "retryTimeout": 1200,
                "primary": {
                    "apn": "internet",
                    "type": "ipv4v6"
                },
                "secondary": {
                    "type": "ipv4v6"
                }
            },
            "pinCode": u"",
            "keepalive": {
                "enable": True,
                "targetHost": "8.8.8.8",
                "intervalSec": 60,
                "reboot": {
                    "enable": False,
                    "cycles": 1
                }
            }
        }

        # act
        data = Index.PUT_SCHEMA(SUT)

        # assert
        self.assertEqual(SUT, data)
Example No. 44
    def __init__(self, comp=False, conf=False):
        '''
        Initiate the diagnosis finder.

        Parameters:
        comp - Recompiles the data files if set to True
        conf - Supply a Conf object
        '''
        if conf:
            self.conf = conf
        else:
            self.conf = Conf()
        # use self.conf so that a default Conf() is passed on as well
        self.compiler = Compile(self.conf)
        if comp:
            self.compiler.compile()
        self.index = Index(self.conf)
Example No. 45
def optimize_language_parameters(measures=['AveragePrecision']):

    index = Index()
    models, names = [], []
    lissage_values = np.arange(0.1, 1.0, 0.1)

    for l in lissage_values:
        models.append(LanguageModel(index, l))
        names.append('LanguageModel_'+str(l))

    scores = compare_models(names, models, measures)
    ap = [scores[model_name]['AveragePrecision']['mean'] for model_name in names]
    ap_std = [scores[model_name]['AveragePrecision']['eval_std'] for model_name in names]

    fig = plt.figure(figsize=(10, 8))
    plt.plot(lissage_values, ap)
    plt.title('Language model: average precision depending on smoothing values')
    plt.xlabel('Smoothing')
    plt.ylabel('Average Precision')
    plt.savefig('plot/Language_Model_ap.png')

    fig = plt.figure(figsize=(10, 8))
    plt.plot(lissage_values, ap_std)
    plt.title('Language model: average precision std depending on smoothing values')
    plt.xlabel('Smoothing')
    plt.ylabel('Average Precision std')
    plt.savefig('plot/Language_Model_ap_std.png')

    best_model = names[np.argmax(ap)]
    print('best_model', best_model)
Example No. 46
def main():
    # Must have at least one application
    app = WebApplication()

    # Generate index.html from index.enaml
    with open('index.html', 'wb') as f:
        f.write(Index().render())
Example No. 47
 def __init__(self,
              n_spiders=1,
              n_document_processors=1,
              seeds=None,
              indexable_content_types=None,
              do_not_crawl=None):
     """
     n_spiders -- The number of spider processes to use
     n_document_processors -- The number of document processors to use
     seeds -- A list of initial URLs to crawl
     indexable_content_types -- A list of MIME content-types for which files should be indexed
     do_not_crawl -- A list of regular expressions for which matching domain names will not
                     be crawled
     """
     # Normalize None defaults to fresh lists (avoids a shared mutable default argument)
     if not indexable_content_types: indexable_content_types = ['text/html']
     if not seeds: seeds = []
     if do_not_crawl is None: do_not_crawl = []
     self.frontier = Manager().Queue()
     self.document_store = Manager().Queue()
     self.visited_cache_path = 'visited_cache'
     self.visited_cache = None
     self.index = Index()
     self.n_spiders = n_spiders
     self.n_document_processors = n_document_processors
     self.indexable_content_types = indexable_content_types
     self.spiders = []
     self.document_processors = []
     self.seed_urls = seeds
     self.do_not_crawl = do_not_crawl
     for seed_url in seeds:
         self.frontier.put(seed_url)
     self.status = 'STOPPED'
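A hypothetical startup call, assuming the enclosing class is named `Crawler` (the snippet only shows its `__init__`). The `Manager().Queue()` objects make the frontier and document store shareable across the spider and document-processor subprocesses:

crawler = Crawler(n_spiders=2,
                  n_document_processors=1,
                  seeds=['https://example.com'],
                  indexable_content_types=['text/html', 'text/plain'],
                  do_not_crawl=[r'.*\.internal$'])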
Example No. 48
def make_indexes(number_of_columns, prim_key_column, table):
    """
    Make a list of empty indices for the table on setup. The primary key column index must always exist so instantiate
    only this index to begin with.

    :param number_of_columns: int       # the number of columns in the table
    :param prim_key_column: int         # the index of the primary key column
    :param table: table object          # the table for which these indexes are being created

    :return: []                         # a list of None values, with an Index object in the primary key column slot
    """
    indexes = [None] * number_of_columns
    index = Index()
    index.create_index(table=table, column_number=prim_key_column)
    indexes[prim_key_column] = index
    return indexes
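A hypothetical call for a five-column table whose primary key is column 0 (`table` stands for whatever object `Index.create_index` expects):

indexes = make_indexes(number_of_columns=5, prim_key_column=0, table=table)
# indexes == [<Index for column 0>, None, None, None, None]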
Example No. 49
 def setUp(self):
     root_path = os.path.dirname(os.path.realpath(__file__)) + "/../"
     try:
         os.unlink(root_path + "data/ntp.json")
         os.unlink(root_path + "data/ntp.json.backup")
     except OSError:
         pass
     self.index = Index(connection=Mockup())
Example No. 50
 def __init__(self, workspace):
     self.workspace = workspace
     self.index = Index(os.path.join(workspace, '.git', 'index'))
     self.config = Config(workspace)
     self.head_path = self._get_head_path()
     self.head_tree = None
     if os.path.exists(self.head_path):
         self.head_tree = read_file(self.head_path).strip()
Example No. 51
    def setUp(self):
        self.root_path = os.path.abspath(os.path.dirname(__file__) + "/../")
        self.jsons = glob.glob(os.path.join(self.root_path, "data/*.json"))
        self.backups = glob.glob(os.path.join(self.root_path, "data/*.backup"))
        for file in self.jsons + self.backups:
            os.unlink(file)

        self.index = Index(connection=Mockup())
Example No. 52
class Kbase:
    def init(self, *file):
        self.rules = []
        self.if_index = Index().init()
        self.then_index = Index().init()

        if file:
            self.load_rules(file[0])
        return self

    def remove_rule(self, id):
        for i in range(len(self.rules)):
            if self.rules[i]["rule"] == id:
                rule = self.rules[i]
                # unindex the rule before deleting it; the original deleted first
                # and then read self.rules[i], which pointed at the wrong rule
                for if1 in rule["if"]:
                    self.if_index.delete(if1)
                for then in rule["then"]:
                    self.then_index.delete(then)
                del self.rules[i]
                break

    def add_rule(self, rule):
        self.index_rule(rule)
        self.rules.append(rule)

    def index_rule(self, rule):
        for if1 in rule["if"]:
            self.if_index.store(if1, rule)  # fwd: fact/if index tree
        for then in rule["then"]:
            self.then_index.store(then, rule)  # bkwd: goal/then index tree

    def match_if(self, fact):
        return self.if_index.search(fact)

    def match_then(self, goal, dict):
        return self.then_index.search(goal, dict)

    def load_rules(self, name):
        try:
            file = open(name.strip(), "r")
            contents = file.read()                   # 'rule. rule.'
            rules = contents.split(".")              # ['rule','rule','']
            del rules[len(rules) - 1]                # ['rule','rule']
            for rule in rules:
                self.add_rule(internal_rule(rule))   # [{rule},{rule}]
            file.close()
        except IOError as cause:
            print("file error:", cause)
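The rule format `index_rule` and `remove_rule` assume can be read off the code. A hypothetical usage sketch (the pattern syntax accepted by `internal_rule` and the index search is not shown here):

kb = Kbase().init()
kb.add_rule({
    'rule': 'r1',                        # rule id, matched by remove_rule
    'if':   [['parent', '?x', '?y']],    # antecedents, stored in if_index
    'then': [['ancestor', '?x', '?y']],  # consequents, stored in then_index
})
matching = kb.match_if(['parent', '?x', '?y'])
kb.remove_rule('r1')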
Example No. 53
 def __init__(self, tweets):
     # send tweets into index
     self.index = Index(tweets)

     # get documents from index
     self.documents = self.index.documents
     #print(self.documents)

     self.dVectors = [None] * len(self.documents)

     # We want a slightly easier way to deal with the documents, so we're translating them
     # from what we had before (document objects) to dictionaries of words -> tfidf values,
     # so we don't have to recompute tfidf values again and again and again...
     for d in sorted(self.documents.keys()):
         doc = self.documents[d]
         vecs = {}
         for posting in doc.getPostingsList().values():
             vecs[posting.getTerm()] = self.getTFIDF(posting)

         # Now we have vectors of all words in the document... but maybe we just want the top x
         #vecs = self.reduceDimensionality(vecs)

         self.dVectors[d] = vecs

     # just for testing
     #self.dVectors = createDummyData()
     #self.documents = None

     print(self.cosineScore(self.dVectors[1], self.dVectors[1]))

     # run kmeans for k = 2, 4, 6 and 8
     statsFor2 = self.runKMeans(2)
     statsFor4 = self.runKMeans(4)
     statsFor6 = self.runKMeans(6)
     statsFor8 = self.runKMeans(8)

     # start printing stats
     print("-------------------------------------------------------------------")
     distanceFromOrigin = 0
     for document in self.dVectors:
         if self.distanceMetric == 'euclidean':
             distanceFromOrigin += self.distanceBetween({}, document)
         elif self.distanceMetric == 'cosine':
             distanceFromOrigin += self.cosineScore({}, document)
         else:
             print('you have not defined a distance metric')

     print('STATISTICS REPORT FOR K = 1, WHERE THE CENTER IS THE ORIGIN:')
     print("  RSS:", distanceFromOrigin)

     self.printStats(statsFor2)
     self.printStats(statsFor4)
     self.printStats(statsFor6)
     self.printStats(statsFor8)
Example No. 54
 def __init__(self, config, lang=None):
     """
     Constructor method
     """
     
     self.config = config
     self.lang = lang
     self.subscribers = Subscribers(config)
     self.index = Index(self.config)
Example No. 55
 def index_doc_set(self, doc_set_name):
     path = os.path.join(self._root_directory, doc_set_name)
     if os.path.isdir(path):
         idx = Index.open_or_create(self._index_directory)
         doc_set = DocSet(self._index_directory)
         doc_set.add_doc_set(doc_set_name)
         idx.add_field(_DOC_SET_FIELD, TEXT(stored=True))
         idxr = _DocSetIndexer(idx, doc_set_name)
         idxr.index_directory(path)
Example No. 56
 def test_calling_method_load_should_retrieve_object_from_pickle_file(self):
     fp = NamedTemporaryFile(delete=False)
     fp.close()
     self.filename = fp.name
     index = Index()
     index.add_document('coffee', 'I liked it')
     index.add_document('water', 'I need it')
     index.dump(self.filename)
     retrieved_index = Index.load(self.filename)
     self.assertEquals(len(retrieved_index), 2)
     self.assertEquals(set(retrieved_index._index.keys()),
                       set(['i', 'liked', 'need', 'it']))
Example No. 57
def get_gradient(im, index, border_thickness_steps):
    """
    Fun. calc. radial gradient including thickness of cell edges
    @param im: image (for which grad. will be calc.)
    @param index: indices of pixes sorted by polar coordinates (alpha, radius)
    @param border_thickness_steps: number of steps to cop. grad. - depends on cell border thickness
    @return: gradient matrix for cell
    """
    # index of axis used to find max grad.
    max_gradient_along_axis = 2

    # preparing the image limits (called subimage) for which grad. will be computed
    radius_lengths, angles = index.shape[:2]

    # matrix init
    # for each single step for each border thick. separated grad. is being computed
    # at the end the max. grad values are returned (for all steps of thickness)
    border_thickness_steps = int(border_thickness_steps)
    gradients_for_steps = np.zeros((radius_lengths, angles, border_thickness_steps), dtype=np.float64)

    # for every step of thickness:
    for border_thickness_step in range(1, int(border_thickness_steps) + 1):
        # find beg. and end indices of input matrix for which the gradient will be computed
        matrix_end = radius_lengths - border_thickness_step
        matrix_start = border_thickness_step

        # find beg. and end indices of pix. for which the gradient will be computed
        starting_index = index[:matrix_end, :]
        ending_index = index[matrix_start:, :]

        # find the interval in the result matrix where the computed gradient will go
        intersect_start = int(math.ceil(border_thickness_step / 2.0))
        intersect_end = int(intersect_start + matrix_end)

        # comp. current gradient for selected (sub)image
        current_step_gradient = im[Index.to_numpy(ending_index)] - im[Index.to_numpy(starting_index)]
        current_step_gradient /= np.sqrt(border_thickness_step)

        # save gradient to previously determined place in results matrix
        gradients_for_steps[intersect_start:intersect_end, :, border_thickness_step - 1] = current_step_gradient

    return gradients_for_steps.max(axis=max_gradient_along_axis)
Example No. 58
def test_index_file():
    dir = './test_index'
    idx = Index.create(dir)
    writer = idx.get_writer()
    writer.add_document(
        path = u'/foo/bar',
        title = u'Foo: The History',
        last_modified = 34343423423,
        text = u'Not much to say here')
    writer.commit()
    assert idx.doc_count() == 1
Example No. 59
 def createIndex(self, name, col): 
     """ Create an index on the table.
     Arguments:
     name -- name of the index, used to identify it
     col -- name of column to be used as key for the index
     """
 
     index = Index.create(name, col)
     self.indexes.append(index)
     for row in self.rows:
         index.insert(row)