def __init__(self):
    self.parameter = list()
    self.total = list()
    self.paramCount = 0
    self.type = 'string'
    self.actionIndexer = Indexer(self.type)
    self.featureIndexer = Indexer(self.type)
def indexDirs():
    click.echo("Indexing Files....")
    libraries = jhandler.getLibs()
    indxer = Indexer(libraries)
    nols = indxer.index()
    click.echo("{} library items detected!".format(nols))
    click.echo("Done Indexing Files....")
def processFile(self):
    interpreter = Interpreter()
    # return all the documents present in the file
    output = self.path + '.bin'
    if isfile(output):
        print('loading tokens')
        self.index = pickle.load(open(output, 'rb'))
        self.indexer = Indexer(self.tokenizer, index=self.index)
    else:
        self.indexer = Indexer(self.tokenizer)
        file = open(self.path, 'r', encoding='utf-8', errors='ignore')
        maximum = os.stat(self.path).st_size
        # initialize the variables
        i = 0
        progress = 0
        document = []
        for line in file:
            progress += len(line)
            if line == '\n':
                interpreter.process(self.indexer, document)
                document = []
            else:
                document += [line]
            i += 1
            if i >= 5000:
                i = 0
                log(progress, maximum)
        file.close()
        self.index = self.indexer.index
        print('\nsaving tokens')
        pickle.dump(self.index, open(output, 'wb'))
class AbsolutePositionalEmbedding(AbstractEmbedding):
    def __init__(self, device):
        super(AbsolutePositionalEmbedding, self).__init__(device=device)
        self.max_length = 150
        self.indexer = Indexer(special_tokens={'<s>': 0, '<unk>': 1, '<pad>': 2, '<\s>': 3, '<mask>': 4},
                               with_del_stopwords=self.with_del_stopwords)
        self.indexer.add_sentence(list(map(str, range(self.max_length))), with_raw=True)
        self.embedding_dim = 20
        self.embedding = nn.Embedding(num_embeddings=len(self.indexer),
                                      embedding_dim=self.embedding_dim,
                                      padding_idx=self.indexer.padding_index)
        self.embedding.to(device)

    def forward(self, sentences):
        sentences = [self.indexer.tokenize(sentence) for sentence in sentences]
        sentences = [[str(i) for i, _ in enumerate(sentence)] for sentence in sentences]
        indexes = [[self.indexer.get_index(word) for word in sentence] for sentence in sentences]
        pad_indexes = self.pad_sequence(indexes)
        pad_indexes = torch.Tensor(pad_indexes).long().to(self.device)
        vectors = self.embedding(pad_indexes)
        return vectors
def main():
    parser = util.get_dates_range_parser()
    args = parser.parse_args()

    ind = Indexer(DB_DIR)
    #prof1, nouns1 = get_profiles(ind, args.start)
    #prof2, nouns2 = get_profiles(ind, args.end)

    cur = ind.get_db_for_date(args.start)
    prof, nouns = get_profiles(ind, args.start)

    replys_v = set()
    for p in prof:
        replys_v |= set(prof[p].replys.keys())

    m = []
    for p in prof:
        m_i = []
        for r in replys_v:
            if r in prof[p].replys:
                m_i.append(prof[p].replys[r])
            else:
                m_i.append(0)
        m.append(m_i)

    logging.info("%s x %s" % (len(m), len(m[0])))

    u, s, v = numpy.linalg.svd(m, full_matrices=False)

    k = 50
    uk = numpy.transpose(numpy.transpose(u)[:k])
    sk = s[:k]

    stats.create_given_tables(cur, ["noun_similarity"])
    cur.execute("create table if not exists noun_sim_svd as select * from noun_similarity limit 0")

    p_keys = prof.keys()
    sims = []
    for i in range(0, len(p_keys)):
        for j in range(i + 1, len(p_keys)):
            p1_ = map(lambda x: u[i][x] * sk[x], range(0, k))
            p2_ = map(lambda x: u[j][x] * sk[x], range(0, k))
            sim = numpy.dot(p1_, p2_) / (numpy.linalg.norm(p1_) * numpy.linalg.norm(p2_))
            sims.append((p_keys[i], p_keys[j], sim))
            if len(sims) > 20000:
                save_sims(cur, sims)
                sims = []
                logging.info("Another 10k seen")

    save_sims(cur, sims)
    logging.info("done")
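# Note: the pairwise loop above computes the cosine similarity of the rank-k SVD
# profiles one pair at a time. A minimal vectorized sketch of the same idea,
# assuming a NumPy row-per-profile matrix `m` (function name hypothetical):
import numpy

def svd_cosine_similarities(m, k=50):
    """Cosine similarity between every pair of rank-k reduced profile vectors."""
    u, s, v = numpy.linalg.svd(m, full_matrices=False)
    reduced = u[:, :k] * s[:k]                       # one row per profile in the reduced space
    norms = numpy.linalg.norm(reduced, axis=1, keepdims=True)
    unit = reduced / numpy.clip(norms, 1e-12, None)  # guard against zero-length rows
    return unit.dot(unit.T)                          # [i, j] = cosine(profile_i, profile_j)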
def main():
    # Indexer Initialization
    indexer = Indexer()
    indexer.build_dictionary()
    # indexer.write_dict_to_file()

    # classify files
    run_classifier(indexer)
def main():
    start = time.time()
    indexer = Indexer(logging.DEBUG)
    word_dict = indexer.get_normalized_fequency()
    cosine = CosineScorer(word_dict, "the cat and the dog jkhgdh", logging.DEBUG)
    score = cosine.get_score("httpwwwvimncompressnickseriessam uSam jkhgdh ampcat",
                             "http://www.google.com")
    print score
    print str(time.time() - start) + " seconds"
def __init__(self):
    self._max_url_length = 100
    self._url_list = []
    self._title_list = []
    self._max_stay_on_site = 100
    self._current_on_site = 0
    self._previous_domain = None
    self._max_urls_in_list = 500
    self._max_new_urls_per_page = 100
    self._aggressive_pruning = True
    self._indexer = Indexer("localhost", 9200)
def __init__(self, numOfLayer):
    self.num = numOfLayer
    self.parent = []
    self.children = []
    self.handled = []
    self.Indexer = Indexer()
    self.Processor = Processor()
    self.Porter = PorterStemmer()
    self.db = []
    link = "http://www.cse.ust.hk/"
    self.parent.append(link)
def main():
    parser = util.get_dates_range_parser()
    args = parser.parse_args()

    logging.info("Start")

    ind = Indexer(DB_DIR)

    grades = (1, 10, 100, 1000)
    data = [["date", "nouns", "tweets", "tweet_chains"] + map(lambda x: "cnt > %s" % x, grades)]
    print data
    dates = []
    for date in sorted(ind.dates_dbs.keys()):
        if args.start is not None and date < args.start:
            continue
        if args.end is not None and date > args.end:
            continue
        cur = ind.get_db_for_date(date)
        tables = cur.execute(
            "SELECT name FROM sqlite_master WHERE type='table' and name = 'tweets_nouns'"
        ).fetchall()
        if len(tables) == 0:
            logging.error("No tweets_nouns for date %s" % date)
            continue
        stats.create_given_tables(cur, ["post_cnt"])
        post_cnt = cur.execute("select count(*) from post_cnt").fetchone()[0]
        if post_cnt == 0:
            cur.execute(
                "insert or ignore into post_cnt select noun_md5, count(*) from tweets_nouns group by noun_md5"
            )
        cnt = [date]
        nouns_cnt = cur.execute("select count(*) from nouns").fetchone()[0]
        cnt.append(nouns_cnt)
        tweets = cur.execute("select count(*) from tweets").fetchone()[0]
        cnt.append(tweets if tweets is not None else "~")
        tweet_chains = cur.execute("select count(*) from tweet_chains").fetchone()[0]
        cnt.append(tweet_chains if tweet_chains is not None else "~")
        for i in grades:
            cnti = cur.execute(
                "select count(*) from (select 1 from post_cnt where post_cnt > %s group by post_md5)" % i
            ).fetchone()[0]
            cnt.append("%.2f" % ((cnti + 0.0) / nouns_cnt))
        data.append(cnt)

    for row in data:
        print_cols(row)
def load(self):
    self.indexer = Indexer(self.posting_path)
    if self.to_stem:
        self.indexer.to_stem = True
    self.languages = self.indexer.load()
    self.avg_doc_length = self.indexer.docs_avg_length
    self.searcher = Searcher(self.main_path, self.posting_path, self.indexer.terms_dict,
                             self.indexer.cities_dict, self.indexer.docs_dict,
                             self.avg_doc_length, self.to_stem, self.with_semantics)
    self.searcher.model = Word2Vec.load(self.posting_path + '//model.bin')
def start(self):
    self.indexer = Indexer(self.posting_path)
    if self.to_stem:
        self.indexer.to_stem = True
    dirs_list = os.listdir(self.main_path + '\\corpus')

    # Create temp postings Multiprocessing
    dirs_dict = ParallelMain.start(self.main_path, self.posting_path, self.to_stem, dirs_list)

    # Merging dictionaries that were created by the processes
    docs = {}
    files_names = []
    post_files_lines = []
    total_length = 0
    for dir in dirs_dict.keys():
        tmp_docs_dict = dirs_dict[dir][2]
        for doc_id in tmp_docs_dict:
            docs[doc_id] = tmp_docs_dict[doc_id]
            total_length += docs[doc_id].length
        for lang in dirs_dict[dir][3]:
            self.languages.add(lang)
        old_post_files_lines = dirs_dict[dir][0]
        for i in range(0, len(old_post_files_lines)):
            files_names.append(dir + "\\Posting" + str(i) if not self.to_stem else dir + "\\sPosting" + str(i))
            post_files_lines.append(old_post_files_lines[i])
    self.avg_doc_length = total_length / len(docs)

    # Gets Cities that appear in the corpus
    i = 0
    while i < len(dirs_list):
        self.reader.read_cities(self.main_path + '\\corpus', dirs_list[i])
        i += 1

    terms_dicts = [dirs_dict["\\Postings1"][1], dirs_dict["\\Postings2"][1],
                   dirs_dict["\\Postings3"][1], dirs_dict["\\Postings4"][1]]
    terms_dict = Merge.start_merge(files_names, post_files_lines, terms_dicts,
                                   self.posting_path, self.to_stem)
    self.indexer.docs_avg_length = self.avg_doc_length
    self.indexer.terms_dict = terms_dict
    self.indexer.docs_dict = docs
    self.indexer.index_cities(self.reader.cities)
    self.indexer.post_pointers(self.languages)
def __init__(self, data_path=""):
    '''Either read the Indexer info from files or generate it.
       If readFromFiles is true: path = path to the dir containing the results from saving Indexer files before
       ELSE readFromFiles is false: path = path to the data dir'''
    self.createQueryDir()
    Indexer.__init__(self)
    index_timer_start = time.time()
    if len(data_path) == 0:  # No need to calculate again
        self.read_files(self.indexer_path)  # Indexer_Data
        print("Read index in {0} seconds".format(str(time.time() - index_timer_start)))
    else:
        self.handle_dir(data_path)
        self.create_tf_idf()
        print("Generated index in seconds: {0}".format(str(time.time() - index_timer_start)))
        self.save_indexer_to_files()
    self.inverse_doc_lookup = self.get_inverse_doc_lookup()
def __init__(self):
    DBCrawl.connect()
    DBUnCrawl.connect()
    DBRobot.connect()
    DBWebPage.connect()
    DBPageRank.connect()
    DBIndexer.connect()
    indexedCount.connect()
    #DBQuery.connect()
    self._getDBTables()
    self.indexer = Indexer()
    self.numberOfThreads = 1
    self._setNumOfThreads()
    self.crawlerObjs = []
    self._createCrawlerObjects()
class StanfordTwitterEmbedding(AbstractEmbedding):
    def __init__(self, device):
        super(StanfordTwitterEmbedding, self).__init__(device=device)
        self.path = Path('../data/models/glove.twitter.27B/glove.twitter.27B.200d.txt')
        with_raw_file = False
        if with_raw_file:
            with self.path.open('r', encoding='utf-8-sig') as f:
                texts = f.readlines()
            headers = [len(texts), None]
            vocab, weights = map(list, zip(*Parallel(n_jobs=10)(
                [delayed(self.get_weights)(text) for text in texts])))
            with (self.path.parent / 'vocab.pkl').open('wb') as f:
                pickle.dump(vocab, f)
            with (self.path.parent / 'weights.pkl').open('wb') as f:
                pickle.dump(weights, f)
        else:
            with (self.path.parent / 'vocab.pkl').open('rb') as f:
                vocab = pickle.load(f)
            with (self.path.parent / 'weights.pkl').open('rb') as f:
                weights = pickle.load(f)
        self.indexer = Indexer(special_tokens={'<s>': 0, '<unk>': 1, '<pad>': 2, '<\s>': 3, '<mask>': 4},
                               with_del_stopwords=self.with_del_stopwords)
        for word in vocab:
            self.indexer.count_word(word)
            self.indexer.add_word(word)
        self.embedding_dim = len(weights[0])
        special_weights = [[0.0] * self.embedding_dim] * 5
        weights = torch.FloatTensor(special_weights + weights)
        self.embedding = nn.Embedding.from_pretrained(embeddings=weights,
                                                      padding_idx=self.indexer.padding_index)
        self.embedding.to(device)

    def get_weights(self, text):
        content = text.split(' ')
        return content[0], list(map(float, content[1:]))
def __init__(self, device):
    super(NtuaTwitterEmbedding, self).__init__(device=device)
    self.path = Path('../data/models/ntua-slp-semeval2018/ntua_twitter_300.txt')
    with self.path.open('r', encoding='utf-8-sig') as f:
        texts = f.readlines()
    headers = texts[0].strip().split(' ')
    contents = [text.strip().split(' ') for text in texts[1:]]
    vocab = [content[0] for content in contents]
    weights = [list(map(float, content[1:])) for content in contents]
    self.indexer = Indexer(special_tokens={'<s>': 0, '<unk>': 1, '<pad>': 2, '<\s>': 3, '<mask>': 4},
                           with_del_stopwords=self.with_del_stopwords)
    for word in vocab:
        self.indexer.count_word(word)
        self.indexer.add_word(word)
    self.embedding_dim = int(headers[1])
    special_weights = [[0.0] * self.embedding_dim] * 5
    weights = torch.FloatTensor(special_weights + weights)
    self.embedding = nn.Embedding.from_pretrained(embeddings=weights,
                                                  padding_idx=self.indexer.padding_index)
    self.embedding.to(device)
def __init__(self, device):
    super(RawEmbedding, self).__init__(device=device)
    self.indexer = Indexer(special_tokens={'<s>': 0, '<unk>': 1, '<pad>': 2, '<\s>': 3, '<mask>': 4},
                           with_del_stopwords=self.with_del_stopwords)
    datasets = Dataset().get_instance()
    sentences = [pairs[0] for pairs in datasets['train']]
    self.indexer.count_word_in_text(sentences)
    self.indexer.add_sentences(sentences)
    self.embedding_dim = 100
    self.embedding = nn.Embedding(num_embeddings=len(self.indexer),
                                  embedding_dim=self.embedding_dim,
                                  padding_idx=self.indexer.padding_index)
    self.embedding.to(device)
def do_query(query, idf_t):
    q_terms = set(ind.parse_html(query))
    db = sqlite3.connect("data/pages.db")
    cursor = db.cursor()
    page_ids = set()
    for term in q_terms:
        db_q = """SELECT indexTable.pageId FROM indexTable, freqTable WHERE freqTable.id = indexTable.termID AND freqTable.term = "{0}" """.format(term)
        cursor.execute(db_q)
        tmp_page_ids = set()
        if len(page_ids) == 0:
            for page_id in cursor.fetchall():
                page_ids.add(page_id[0])
        else:
            for page_id in cursor.fetchall():
                tmp_page_ids.add(page_id[0])
            page_ids = page_ids.intersection(tmp_page_ids)

    import time
    page_score = []
    i = 0.0
    for page_id in page_ids:
        t = time.time()
        db_q = """SELECT html FROM htmlParsed WHERE pageId = ?"""
        cursor.execute(db_q, (page_id,))
        times[0].append(time.time() - t)
        doc = str(cursor.fetchall()[0][0]).split()  # ind.parse_html(str(cursor.fetchall()[0][0]))
        times[1].append(time.time() - t)
        page_score.append((page_id, compare_page_query(doc, ind.parse_html(query), idf_t)))
        times[2].append(time.time() - t)
        print('{1:.2%} {0}'.format(page_id, 1.0 / len(page_ids) * i), end='\r')
        i += 1
    db.close()
    return page_score
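# The first loop above ANDs the query terms together by intersecting the page-id
# sets returned for each term. The same boolean-AND retrieval over in-memory
# posting lists, as a standalone sketch (the `postings` dict is hypothetical):
def intersect_postings(postings, terms):
    """Return the ids of documents that contain every term in `terms`."""
    result = None
    for term in terms:
        ids = set(postings.get(term, ()))
        result = ids if result is None else result & ids
        if not result:  # short-circuit: no document contains all the terms
            break
    return result or set()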
def interact(c):
    # Create an indexer object
    indexer = Indexer(root)
    while True:
        # Get directory details
        curr_directory, folders, files = indexer.get_dir_details()
        print "Current Directory :", curr_directory

        # Send the details to the client
        c.send(curr_directory + "\n")
        for item in folders:
            c.send(item + '\n')
        c.send("\n")
        for item in files:
            c.send(item[0] + '\n' + str(item[1]) + "\n")
        c.send("/")

        # Receive response from client
        choice = c.recv(1024)
        if choice == "-1":  # -1 means disconnect
            print "Disconnecting from Client"
            return
        elif int(choice) <= len(folders):  # Change directory if a folder is selected
            indexer.make_choice(int(choice))
        else:
            send_file(c, indexer.get_file_path(int(choice)))  # Send file if a file is selected
            return
def start_indexing(dirs_list, dirs_dicts, main_path, posting_path, to_stem, start_index, end_index, directory):
    dirs_dicts[directory] = None
    reader = ReadFile()
    parser = Parse(main_path)
    indexer = Indexer(posting_path + directory)
    if to_stem:
        parser.to_stem = True
        indexer.to_stem = True
    if not os.path.exists(posting_path + directory):
        os.makedirs(posting_path + directory)
    documents = {}
    i = start_index
    while i < end_index:
        docs = reader.separate_docs_in_file(main_path + '\\corpus', dirs_list[i])
        j = 0
        for doc_id in docs:
            doc_dict = parser.main_parser(docs[doc_id].text, docs[doc_id])
            docs[doc_id].text = None
            if i == end_index - 1 and j == len(docs) - 1:
                indexer.finished_parse = True
            indexer.index_terms(doc_dict, doc_id)
            documents[doc_id] = docs[doc_id]
            j += 1
        i += 1
    dirs_dicts[directory] = [indexer.post_files_lines, indexer.terms_dict, documents, reader.languages]
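# The partial results stored in dirs_dicts above are combined later by
# Merge.start_merge (see the start() snippet earlier). A minimal sketch of that
# kind of merge for {term: document_frequency} partial dictionaries; the helper
# name and data shape are hypothetical, not the project's actual format:
from collections import defaultdict

def merge_term_dicts(partial_dicts):
    """Combine per-worker {term: df} dictionaries into one."""
    merged = defaultdict(int)
    for partial in partial_dicts:
        for term, df in partial.items():
            merged[term] += df
    return dict(merged)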
def crawl(current_url):
    print('Total in Queue', len(Crawler.queue), '| Total Crawled', len(Crawler.crawled))
    if '.vhd' not in current_url:
        try:
            with urllib.request.urlopen(current_url) as response:
                html = response.read()
                soup = BeautifulSoup(html, "html.parser")
                print(" crawling", current_url)
                for link in soup.findAll('a', attrs={'href': re.compile("^http")}):
                    href = link.get('href')
                    if href not in Crawler.queue and href not in Crawler.crawled:
                        Crawler.queue.add(href)
                Crawler.crawled.add(current_url)
                Crawler.queue.discard(current_url)
                Indexer.indexer(current_url, soup)
                Crawler.save_lists()
        except:
            print("ERROR", current_url)
            Crawler.queue.discard(current_url)
            Crawler.save_lists()
            pass
def init():
    bot_id = '1437569240:AAEd2sZ0faC1EwPvQGJPPW4xf7ohP1hTzV8'
    updater = Updater(bot_id)
    updater.setPhotoHandler(imageHandler)
    QualityChecker.init()
    ShoeDetector.init()
    FeatureExtractor.init()
    data_structure = Indexer.build_data_structure(config.DATASET_PATH)
    Matcher.init(data_structure)
    print("Bot is running...")
    updater.start()
def imageHandler(bot, message, chat_id, local_filename):
    bot.sendMessage(chat_id, "Hi, I'm processing your request")
    print("Processing request...")
    is_good_quality = QualityChecker.is_good_quality(
        Indexer.load_image(local_filename, im_size=config.QUALITYCHECKER_IMSIZE))
    if not is_good_quality:
        bot.sendMessage(chat_id, "Your image is of a poor quality. Please, send me a better one")
        print("Message sent: image is of a poor quality.")
    else:
        is_shoe = ShoeDetector.classify_image(
            Indexer.load_image(local_filename, im_size=config.CLASSIFIER_IM_SIZE))
        if not is_shoe:
            bot.sendMessage(chat_id, "Ops! Something went wrong... Make sure your image contains a shoe")
            print("Message sent: the photo doesn't contain a shoe.")
        else:
            try:
                most_similar = Matcher.get_most_similar(Indexer.load_image(local_filename))
                retrieved_images = Matcher.retrieve_items(most_similar)
                bot.sendMessage(chat_id, "These are the most similar shoes I've found")
                for im in retrieved_images:
                    bot.sendImage(chat_id, config.DATASET_PATH + im, "")
                print("Most similar images sent.")
            except FeatureExtractionException:
                bot.sendMessage(chat_id, "I couldn't process your photo. Please, send me a better one")
                print("Message sent: the photo can't be processed.")
    print("Request processed.")
def main():
    indexer = Indexer()
    numDocs = 0
    for subdir, dirs, files in os.walk(r'C:\Users\Justin Ho\Documents\CS 121\developer\DEV'):
        for filename in files:
            filepath = subdir + os.sep + filename
            f = open(filepath)
            data = json.load(f)
            print(data["url"])
            indexer.parse(data['content'], data['url'])
            numDocs += 1
    sortedTokens = sorted(indexer.invertedIndex.items(), key=lambda x: x[1]["tf-idf"], reverse=True)
    print("Number of Documents: {}".format(numDocs))
    print("Number of Unique Tokens: {}".format(len(indexer.invertedIndex.keys())))
    file1 = open("index.txt", "a")
    for k, v in sortedTokens:
        file1.write("{}:{}\n".format(k, v))
    file1.close()
class PostagEmbedding(AbstractEmbedding):
    def __init__(self, device):
        super(PostagEmbedding, self).__init__(device=device)
        self.indexer = Indexer(special_tokens={'<s>': 0, '<unk>': 1, '<pad>': 2, '<\s>': 3, '<mask>': 4},
                               with_del_stopwords=False)  # always False for the POS-tag embedding only
        datasets = Dataset().get_instance()
        sentences = [nltk.pos_tag(self.indexer.tokenize(pairs[0])) for pairs in datasets['train']]
        sentences = [[pairs[1] for pairs in sentence] for sentence in sentences]
        for sentence in sentences:
            self.indexer.add_sentence(sentence, with_raw=True)
        self.embedding_dim = 10
        self.embedding = nn.Embedding(num_embeddings=len(self.indexer),
                                      embedding_dim=self.embedding_dim,
                                      padding_idx=self.indexer.padding_index)
        self.embedding.to(device)

    def forward(self, sentences):
        if self.with_del_stopwords:
            postags = [nltk.pos_tag(self.indexer.tokenize(sentence)) for sentence in sentences]
            sentences = [[pairs[0] for pairs in postag] for postag in postags]
            postags = [[pairs[1] for pairs in postag] for postag in postags]
            is_stopword = self.indexer.is_stopword(sentences)
            postags = [[tag for sw, tag in zip(stopword, postag) if sw != 1]
                       for stopword, postag in zip(is_stopword, postags)]
        else:
            postags = [nltk.pos_tag(self.indexer.tokenize(sentence)) for sentence in sentences]
            postags = [[pairs[1] for pairs in postag] for postag in postags]
        indexes = [[self.indexer.get_index(tag) for tag in postag] for postag in postags]
        pad_indexes = self.pad_sequence(indexes)
        pad_indexes = torch.Tensor(pad_indexes).long().to(self.device)
        vectors = self.embedding(pad_indexes)
        return vectors
def probe_vocabs():
    datasets, tags = get_datasets()
    indexer = Indexer(with_preprocess=False)
    n_grams = [1, 2, 3]
    raw_texts = datasets
    multi_stats = {i: {
        'vocabs': {tag: set() for tag in tags},
        'counts': {tag: {} for tag in tags},
        'vocabs_by_labels': {tag: {'INFORMATIVE': set(), 'UNINFORMATIVE': set()} for tag in tags},
        'counts_by_labels': {tag: {'INFORMATIVE': {}, 'UNINFORMATIVE': {}} for tag in tags},
        'ann_texts': {tag: [] for tag in tags},
        'del_texts': {tag: [] for tag in tags}
    } for i in n_grams}

    del_items = set(['<hashtag>', '</hashtag>', '<allcaps>', '</allcaps>', '<user>',
                     'covid19', 'coronavirus', 'covid', '<number>', 'httpurl', 19, '19'])
    del_items |= set(["'", '"', ':', ';', '.', ',', '-', '!', '?', "'s", "<", ">", "(", ")", "/"])
    del_items |= set(nltk_stopwords.words('english'))

    for n_gram in n_grams:
        for tag in tags:
            for text, label in datasets[tag]:
                words = indexer.text_processor.pre_process_doc(text)
                label = get_label_text(label)
                multi_stats[n_gram]['ann_texts'][tag].extend(
                    [['_'.join(words[i: i + n_gram]) for i in range(0, len(words) - n_gram + 1)]])
                del_words = [word for word in words if word not in del_items]
                multi_stats[n_gram]['del_texts'][tag].extend(
                    [['_'.join(del_words[i: i + n_gram]) for i in range(0, len(del_words) - n_gram + 1)]])
                if n_gram != 0:
                    words = del_words
                for word in ['_'.join(words[i: i + n_gram]) for i in range(0, len(words) - n_gram + 1)]:
                    multi_stats[n_gram]['vocabs'][tag].add(word)
                    multi_stats[n_gram]['vocabs_by_labels'][tag][label].add(word)
                    if word in multi_stats[n_gram]['counts'][tag].keys():
                        multi_stats[n_gram]['counts'][tag][word] += 1
                    else:
                        multi_stats[n_gram]['counts'][tag][word] = 1
                    if word in multi_stats[n_gram]['counts_by_labels'][tag][label].keys():
                        multi_stats[n_gram]['counts_by_labels'][tag][label][word] += 1
                    else:
                        multi_stats[n_gram]['counts_by_labels'][tag][label][word] = 1
    return {'multi_stats': multi_stats, 'raw_texts': raw_texts}
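# The list comprehensions above build '_'-joined n-grams inline. The same step as
# a small standalone helper, for clarity (hypothetical, not part of the project):
def joined_ngrams(words, n):
    """Return '_'-joined n-grams, e.g. joined_ngrams(['a', 'b', 'c'], 2) -> ['a_b', 'b_c']."""
    return ['_'.join(words[i:i + n]) for i in range(len(words) - n + 1)]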
def retrieve_items(sorted_rsv, n=3):
    """
    Given the sorted list of similar images, retrieve the n most similar images belonging to different classes
    :param sorted_rsv: a sorted np array. The first column contains scores, the second one contains image names
    :param n: the number of images to retrieve
    :return: list of retrieved image names
    """
    images = sorted_rsv[:, 1]
    image_classes_dict = Indexer.extract_classes()
    classes_already_retrieved = []
    retrieved = []
    for image in images:
        class_id = image_classes_dict[image]
        if class_id not in classes_already_retrieved:
            retrieved.append(image)
            classes_already_retrieved.append(class_id)
            if len(retrieved) == n:
                break
    return retrieved
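# The same "best image per class" selection as above, but with the class lookup
# passed in explicitly instead of read from Indexer.extract_classes(); the names
# and data below are hypothetical:
def retrieve_distinct(ranked_images, image_classes, n=3):
    """Keep the highest-ranked image of each class until n classes are covered."""
    seen_classes, retrieved = set(), []
    for image in ranked_images:
        class_id = image_classes[image]
        if class_id not in seen_classes:
            retrieved.append(image)
            seen_classes.add(class_id)
            if len(retrieved) == n:
                break
    return retrieved

# retrieve_distinct(['a1.jpg', 'a2.jpg', 'b1.jpg', 'c1.jpg'],
#                   {'a1.jpg': 'A', 'a2.jpg': 'A', 'b1.jpg': 'B', 'c1.jpg': 'C'})
# -> ['a1.jpg', 'b1.jpg', 'c1.jpg']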
def probe_sentence_length():
    datasets, tags = get_datasets()
    counts = {tag: {} for tag in tags}
    counts_by_labels = {tag: {'INFORMATIVE': {}, 'UNINFORMATIVE': {}} for tag in tags}
    indexer = Indexer()
    for tag in tags:
        for text, label in datasets[tag]:
            words = indexer.text_processor.pre_process_doc(text)
            label = get_label_text(label)
            if len(words) in counts[tag].keys():
                counts[tag][len(words)].append(words)
            else:
                counts[tag][len(words)] = [words]
            if len(words) in counts_by_labels[tag][label].keys():
                counts_by_labels[tag][label][len(words)] += 1
            else:
                counts_by_labels[tag][label][len(words)] = 1
    return {'counts': counts, 'counts_by_labels': counts_by_labels}
def main(argv):
    collectionFile = ''
    tokenizerType = ''
    try:
        opts, args = getopt.getopt(argv, "hf:t:", ["collectionFile=", "tokenizerType="])
    except getopt.GetoptError:
        print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better>')
        sys.exit()
    if len(opts) != 2:
        print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better>')
        sys.exit()
    for opt, arg in opts:
        if opt == '-h':
            print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better>')
            sys.exit()
        elif opt in ("-f", "--collectionFile"):
            if not path.exists(arg):
                print('Incorrect path to collection file.')
                sys.exit()
            elif not path.exists(arg):
                print('File doesn\'t exists')
                sys.exit()
            collectionFile = arg
        elif opt in ("-t", "--tokenizerType"):
            if arg != '0' and arg != '1':
                print('Incorrect tokenizer type. Simple tokenizer: 0, Better tokenizer: 1.')
                sys.exit()
            tokenizerType = arg
    indexer = Indexer(collectionFile, tokenizerType)
    indexer.listTermsInOneDoc()
    indexer.listHighestDocFreqTerms()
def main(inputDir, outputDir):
    m = Indexer()
    files = os.listdir(inputDir)

    # for measuring elapsed time
    elapsed_time = []
    start = time.time()

    # Loop over all files in the given directory
    for file in files:
        if file.endswith(".html"):
            inputFile = os.path.join(inputDir, file)
            m.parse(inputFile)  # the tokenization happens inside this function
            m.mappings.append((m.doc_num, inputFile))

    end = time.time()
    elapsed_time.append(end - start)
    m.writeFiles(outputDir, N=len(m.mappings))
    print("Ran in {} seconds.".format(elapsed_time[-1]))
class SearchEngine(object):
    """
    Manages crawler, indexer and page rank computer and provides a convenient method for querying
    the search index.

    The class is initialized with a set of seed URLs for the crawler and a list of stop words for
    the indexer. It will then run the crawler, indexer and page rank computer and provides a ready-
    to-use search index which can be queried using query().
    """

    # Words that are filtered out before term/document frequency calculation.
    _stopwords = []

    # Seed URLs for the web crawler, these are the URLs which the web crawler uses to start
    # crawling.
    _seed_urls = []

    # The web crawler, starts with the seed URLs and build up the web graph and extracts terms from
    # the web sites.
    _crawler = None

    # Computes the page ranks for every website using the web graph.
    _page_rank_computer = None

    # The indexer that creates the term frequency and document frequency indexes.
    _indexer = None

    # The webgraph is a dictionary mapping websites to a list of outlinks (websites linked to by
    # that website).
    _webgraph = {}

    # A dictionary mapping websites to extracted terms.
    _extracted_terms = {}

    # A dictionary mapping websites to computed page ranks.
    _page_ranks = {}

    # A dictionary mapping terms to the number of documents they occur in.
    _document_frequency = {}

    # A dictionary mapping terms to a list of tuples of documents and the number of times the term
    # occurs in that document, e.g. { 'term' : [('document', 123), ('anotherdocument', 1234)] }
    _term_frequency = {}

    # A dictionary mapping websites to the length of their content (number of words).
    _document_lengths = {}

    def __init__(self, seed_urls, stopwords=None):
        """
        Initializes the search engine with the given seed urls and the given stop words and does
        the crawling, computes page ranks and builds up the index.

        Args:
            seed_urls: The seed urls for the crawler.
            stopwords: The stop words for the indexer.
        """
        if stopwords is not None:
            self._stopwords = stopwords
        self._seed_urls = seed_urls
        self._do_crawling()
        self._compute_page_ranks()
        self._build_index()

    def _sanitize_query(self, query):
        """
        Sanitizes the query by lowercasing all characters, then creates a dictionary mapping query
        terms to occurrence count.

        Args:
            query: The search query, terms separated by whitespace.

        Returns:
            A dictionary mapping query terms to occurrence count.
        """
        query_terms = query.lower().split()
        terms = {}
        for term in query_terms:
            if term in terms:
                terms[term] = terms[term] + 1
            else:
                terms[term] = 1
        return terms

    def query(self, query):
        """
        Searches the index for every term (separated by whitespace), then sorts the resulting
        documents by relevance using the cosine score algorithm and prints them.

        Args:
            query: The search query, terms separated by whitespace (all terms will be converted
                to lowercase).
        """
        terms = self._sanitize_query(query)
        if not terms:
            print "No search terms entered."
            return

        documents = []
        scores = defaultdict(int)
        querryLength = 0
        for term in terms:
            if not term in self._term_frequency:
                continue
            # The inverse document frequency weight is a measure of informativeness of a term and
            # is calculated by dividing the number of documents in the webgraph by the number of
            # documents the term occurs in.
            #
            # idf = log10(number of documents in webgraph/number of documents containing term)
            idf = log10(len(self._document_frequency) / self._document_frequency[term])
            # The weight of a term in the query is the product of the term frequency weight and
            # the inverse document frequency weight.
            #
            # tqw = (1 + log10(term frequency in the query)) * idf
            term_query_weight = (1 + log10(terms[term])) * idf
            querryLength = querryLength + pow(term_query_weight, 2)

        for term in terms:
            documents_containing_term = []
            if not term in self._term_frequency:
                continue
            # The inverse document frequency weight is a measure of informativeness of a term and
            # is calculated by dividing the number of documents in the webgraph by the number of
            # documents the term occurs in.
            #
            # idf = log10(number of documents in webgraph/number of documents containing term)
            idf = log10(len(self._document_frequency) / self._document_frequency[term])
            # The weight of a term in the query is the product of the term frequency weight and
            # the inverse document frequency weight.
            #
            # tqw = (1 + log10(term frequency in the query)) * idf
            term_query_weight = (1 + log10(terms[term])) * idf
            term_document_weights = {}
            for document_and_count in self._term_frequency[term]:
                document, count, tfidf = document_and_count
                documents_containing_term.append(document)
                # The weight of a term in the document is the product of the weighted term
                # frequency and the inverse document frequency weight.
                #
                # tdw = (1 + log10(frequency of the term in the document)) * idf
                term_document_weights[document] = tfidf

            # Merge documents containing the term with the result list.
            documents = list(set(documents + documents_containing_term))

            # Add the product of the term query weight and the term document weight to each
            # document.
            for document in documents_containing_term:
                score = scores[document] + (term_query_weight * term_document_weights[document])
                scores[document] = score

        # Divide the score of each document d by the length of document d, so that longer and
        # shorter documents have scores in the same order of magnitude.
        for doc in scores:
            scores[doc] = scores[doc] / (self._document_lengths[doc] * sqrt(querryLength))

        print
        if not documents:
            print ("No documents match your search terms (\""
                   "" + ', '.join(str(term) for term in terms) + "\").")
            return

        print "Results:"
        for document in sorted(documents, key=lambda url: self._page_ranks[url] * scores[url],
                               reverse=True):
            print " - " + document
            print (" (Score: " + str(scores[document]) + ""
                   ", PageRank: " + str(self._page_ranks[document]) + ""
                   ", Combined: " + str(self._page_ranks[document] * scores[document]) + ")")

    def _do_crawling(self):
        """
        Initializes the crawler with the seed urls and starts crawling, then stores the resulting
        webgraph and the extracted terms in the attributes.

        Also counts the extracted words in every website and stores each website's length in the
        document_lengths attribute.
        """
        print "Starting crawler ..."
        print " Seed URLs: "
        for url in self._seed_urls:
            print " - " + url
        self._crawler = Crawler(self._seed_urls)
        results = self._crawler.startCrawling()
        self._webgraph = results[0]
        self._extracted_terms = results[1]
        print " Web graph: "
        for url in self._webgraph.keys():
            print " - " + url
            for outlink in self._webgraph[url]:
                print " -> " + outlink
        #print " Extracted terms: "
        #for website in self._extracted_terms:
        #    print " - " + website[0] + ": "
        #    print ', '.join(str(token) for token in website[1])
        print "Crawler finished."
        print

    def _compute_page_ranks(self):
        """
        Initializes the page rank computer with the webgraph and computes the page ranks.
        """
        print "Computing page ranks ..."
        self._page_rank_computer = Computer(self._webgraph)
        self._page_rank_computer.dampening_factor = 0.95
        self._page_rank_computer.compute()
        self._page_ranks = self._page_rank_computer.page_ranks
        print " Page ranks:"
        result_sum = 0
        for website in sorted(self._page_ranks.keys()):
            result_sum += self._page_ranks[website]
            print " - " + website + ": " + str(self._page_ranks[website])
        #print
        #print " Sum: " + str(result_sum)
        print "Page ranks computed."
        print

    def _build_index(self):
        """
        Takes the extracted terms and stop words and builds up the term frequency index and the
        document frequency index.
        """
        print "Building index ..."
        for website_and_terms in self._extracted_terms:
            website = website_and_terms[0]
            terms = website_and_terms[1]
            self._document_lengths[website] = len(terms)
        self._indexer = Indexer(self._extracted_terms, self._stopwords)
        index = self._indexer.buidlindex()
        self._document_frequency = index[0]
        self._term_frequency = index[1]
        self._extracted_terms = index[2]
        self._document_lengths = index[3]
        #pprint(self._term_frequency)
        #print " Document index:"
        #for term in sorted(self._document_frequency):
        #    print " - " + term + ": " + str(self._document_frequency[term]) + " times"
        print
        print " Term frequency:"
        for term in sorted(self._term_frequency):
            print " - " + term + ":"
            print " - Document Frequency: " + str(self._document_frequency[term])
            for document_and_count in self._term_frequency[term]:
                print " - " + document_and_count[0] + ": " + str(document_and_count[1]) + " times"
        print
        print " Document Lengthes"
        pprint(self._document_lengths)
        print "Index build up."
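# A compact sketch of the scoring that query() spells out in its comments:
# idf = log10(N / df), query weight = (1 + log10(tf_q)) * idf, document score =
# sum of query-weight * document-weight, normalised by document length and query
# length. The postings layout below ({term: {doc: tf_idf_weight}}) is hypothetical.
from collections import defaultdict
from math import log10, sqrt

def cosine_scores(query_terms, postings, doc_freq, doc_lengths, num_docs):
    scores = defaultdict(float)
    query_norm_sq = 0.0
    for term, tf_q in query_terms.items():
        if term not in doc_freq:
            continue
        idf = log10(float(num_docs) / doc_freq[term])
        w_q = (1 + log10(tf_q)) * idf
        query_norm_sq += w_q ** 2
        for doc, w_d in postings[term].items():
            scores[doc] += w_q * w_d
    for doc in scores:
        scores[doc] /= doc_lengths[doc] * sqrt(query_norm_sq)
    return dict(scores)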
from Analyzer import Analyzer
from java.io import StringReader
from org.apache.lucene.analysis import TokenStream
from org.apache.lucene.index import IndexWriterConfig
from org.apache.lucene.util import Version
from org.apache.lucene.search import Explanation

lucene.initVM()

analyzer = Analyzer(Version.LUCENE_CURRENT)
config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

indexer = Indexer(config, '/home/hnguyen/Projects/CLIFinder/cli.index')
indexer.index('/home/hnguyen/Projects/CLIFinder/cli')

searcher = Searcher(analyzer, '/home/hnguyen/Projects/CLIFinder/cli.index')

while True:
    strQuery = raw_input("Query:")
    if strQuery == '':
        sys.exit(1)
    docs, query = searcher.search(strQuery, 'content', 'name')
    print '"%s" has %s result(s)' % (strQuery, len(docs))
    for d in docs:
        print 'Score: %s \nFile: %s \nDesc: %s \n' % (d.score,
                                                      searcher.mIndexSearcher.doc(d.doc).get('name'),
                                                      searcher.mIndexSearcher.doc(d.doc).get('content'))
class Crawler(object): __resourcesQueue = set() def __init__(self, seed_URL): self.seed_URL = seed_URL self.indexer = Indexer("people", "person") self.__walk() def __walk(self): #Extract the resources from the seed URL self.__resourcesQueue |= self.__extractResources(self.seed_URL) #Extract the people from the seed URL self.__extractPeople(self.seed_URL) while ((len(self.__resourcesQueue) != 0)): resource_url = self.__resourcesQueue.pop() # Find the linked resources from this resource self.__resourcesQueue |= self.__extractResources(resource_url) self.__extractPeople(resource_url) def __extractResources(self, resource_url): print "Looking for resources in %s" % resource_url resources_query = """ PREFIX foaf:<http://xmlns.com/foaf/0.1/> PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#> SELECT ?resource WHERE { <%s> ?p ?resource }""" % resource_url resources = set(self.__run_query(resources_query, 'Resources')) print "Found %s resources" % len(resources) return resources def __extractPeople(self, resource_url): print "Looking for people in %s" %resource_url people_query = """ PREFIX foaf:<http://xmlns.com/foaf/0.1/> PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#> PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#> SELECT ?person, ?label, ?mentioned_by WHERE { <%s> ?p ?person . ?person rdf:type foaf:Person . ?person rdfs:label ?label . ?mentioned_by ?pred ?person }""" % resource_url people = self.__run_query(people_query, 'People', resource_url) print "Found %s people" % len(people) # Index the returned people object if (len(people) != 0): for person in people: # Remove duplicate entries in the mentioned_by field people[person]['mentioned_by'] = list(people[person]['mentioned_by']) self.indexer.index(person, people[person]) print "Done indexing batch" def __run_query(self, query, type, resource_url=None): query_result_list = [] people_json = {} people_found = set() try: # Set the SPARQL endpoint to run queries against sparql = SPARQLWrapper("http://dbpedia.org/sparql") sparql.setQuery(query) sparql.setReturnFormat(JSON) results = sparql.query().convert() if (type == 'Resources'): for result in results['results']['bindings']: # Only follow resources that are URIs if (result['resource']['type'] == 'uri'): query_result_list.append(result['resource']['value']) return query_result_list if (type == 'People'): for result in results['results']['bindings']: people_found.add(result['person']['value']) # For each distinct person found, create an object for them for person in people_found: people_json[person] = { 'uri': person, 'mentioned_by': set() } for result in results['results']['bindings']: # For each result, get the label and append to the mentioned_by array person_uri = result['person']['value'] person_label = result['label']['value'] label_lang = result['label']['xml:lang'] person_mentioned_by = result['mentioned_by']['value'] if (label_lang == 'en'): people_json[person_uri]['label'] = person_label people_json[person_uri]['mentioned_by'].add(person_mentioned_by) return people_json except Exception: if (type == 'Resources'): return query_result_list elif (type == 'People'): return people_json
    for i in postings1:
        for k in postings2:
            if i == k:
                intersect.append(i)
    """
    """
    intersect = set(postings[0]).intersection(*postings)
    """
    return intersect


x = Indexer()
lines = file('tweets.txt').read().split('\n')
for r in range(1):
    for i in range(1000):
        x.process(docId=i + r * 10000, text=lines[i])

# #for i in x.data.items():
#     print i

#shelf = shelve.open('dump.txt')
#shelf['indexes'] = x.data

#f = file('dump.txt', 'w')
#for i in x.data.items():
#    f.write(str(i) + '\n')
from Frontier import Frontier
from PageRanker import PageRanker
from Indexer import Indexer
from Searcher import Searcher
import re

frontier = Frontier()
pageRanker = PageRanker()
indexer = Indexer()

seedDocuments = [
    'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d01.html',
    'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d06.html',
    'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d08.html'
]


def printWebGraph(webGraph):
    print
    print '-*( Web Graph )*-'
    print
    for entry in sorted(webGraph.keys()):
        print entry + ' -> ' + ', '.join(webGraph[entry])


def printIndex(index):
    print
    print '-*( Indices )*-'
    print
    for term, occurences in sorted(index.iteritems()):
        print '(' + term[0] + ', df:' + str(term[1]) + ') ->',
        print re.sub('(u)?\'', '', str(occurences))
class IndexerTest(unittest.TestCase):
    def setUp(self):
        self.store_mock = IndexStoreMock()
        self.tokenizer_mock = TokenizerMock()
        self.indexer = Indexer(self.store_mock, self.tokenizer_mock)

    def test_term_document_frequency(self):
        # Arrange
        term = "foo"
        document = uuid.uuid4()
        # Act
        self.indexer.term_document_frequency(document, term)
        # Assert
        self.assertEqual(1, self.store_mock.num_method_calls("term_document_frequency"))
        arguments = self.store_mock.get_arguments("term_document_frequency")
        self.assertEqual(document, arguments[0])
        self.assertEqual(term, arguments[1])

    def test_document_frequency_normalized(self):
        # Arrange
        term = "foo"
        document_frequency = 22
        num_documents = 100
        self.store_mock.set_document_frequency(document_frequency)
        self.store_mock.set_num_documents(num_documents)
        # Act
        result = self.indexer.document_frequency_normalized(term)
        # Assert
        self.assertEqual(1, self.store_mock.num_method_calls("document_frequency"))
        document_frequency_args = self.store_mock.get_arguments("document_frequency")
        self.assertEqual(term, document_frequency_args[0])
        self.assertEqual(1, self.store_mock.num_method_calls("num_documents"))
        self.assertEqual(result, 0.22)

    def test_index_empty_text(self):
        # Arrange
        document = uuid.uuid4()
        text = ""
        self.tokenizer_mock.set_tokens([])
        # Act
        self.indexer.index(text, document)
        # Assert
        self.assertEqual(1, self.tokenizer_mock.num_method_calls("tokenize"))
        tokenize_arguments = self.tokenizer_mock.get_arguments("tokenize")
        self.assertEqual(text, tokenize_arguments[0])
        self.assertFalse(self.store_mock.was_called("add"))

    def test_index_one_token(self):
        # Arrange
        document = uuid.uuid4()
        text = "foo"
        self.tokenizer_mock.set_tokens([text])
        # Act
        self.indexer.index(text, document)
        # Assert
        self.assertEqual(1, self.tokenizer_mock.num_method_calls("tokenize"))
        tokenize_arguments = self.tokenizer_mock.get_arguments("tokenize")
        self.assertEqual(text, tokenize_arguments[0])
        self.assertEqual(1, self.store_mock.num_method_calls("add"))
        add_arguments = self.store_mock.get_arguments("add")
        self.assertEqual(document, add_arguments[0])
        self.assertEqual(text, add_arguments[1])

    def test_index_two_tokens(self):
        # Arrange
        document = uuid.uuid4()
        tokens = ["foo", "bar"]
        text = " ".join(tokens)
        self.tokenizer_mock.set_tokens(tokens)
        # Act
        self.indexer.index(text, document)
        # Assert
        self.assertEqual(1, self.tokenizer_mock.num_method_calls("tokenize"))
        tokenize_arguments = self.tokenizer_mock.get_arguments("tokenize")
        self.assertEqual(text, tokenize_arguments[0])
        self.assertEqual(2, self.store_mock.num_method_calls("add"))
        add_arguments1 = self.store_mock.get_arguments("add", 1)
        self.assertEqual(document, add_arguments1[0])
        self.assertEqual(tokens[0], add_arguments1[1])
        add_arguments2 = self.store_mock.get_arguments("add", 2)
        self.assertEqual(document, add_arguments2[0])
        self.assertEqual(tokens[1], add_arguments2[1])

    def test_get_posting_list(self):
        # Arrange
        term = "foo"
        # Act
        self.indexer.get_posting_list(term)
        # Assert
        self.assertEqual(1, self.store_mock.num_method_calls("posting_list"))
        arguments = self.store_mock.get_arguments("posting_list")
        self.assertEqual(term, arguments[0])

    def test_get_terms(self):
        # Arrange
        terms = {"foo", "bar"}
        document = uuid.uuid4()
        self.store_mock.set_terms(terms)
        # Act
        result = self.indexer.get_terms(document)
        # Assert
        self.assertEqual(1, self.store_mock.num_method_calls("get_terms"))
        arguments = self.store_mock.get_arguments("get_terms")
        self.assertEqual(document, arguments[0])
        self.assertEqual(terms, result)
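# The test_document_frequency_normalized case above pins the contract down:
# document_frequency_normalized(term) is the document frequency divided by the
# number of indexed documents (22 / 100 == 0.22). A one-line sketch satisfying
# that contract (the store API shown mirrors the mock's, assumed for illustration):
def document_frequency_normalized(store, term):
    """Fraction of indexed documents that contain `term`."""
    return store.document_frequency(term) / float(store.num_documents())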
class FileManager: COL_TITLE = 0 COL_PATH = 1 COL_PIXBUF = 2 COL_TYPE = 3 store = None root = None index = None thumbnailer = None def __init__(self,root): self.store = gtk.ListStore(str, str, gtk.gdk.Pixbuf, str) self.root = os.path.abspath(root) self.index = Indexer(self.root) def search(self,query): """ Search the index for the given query query needs to be something the Whoosh query parser can parse, otherwise Whoosh exceptions are bubbled up """ self.stop_thumbnailer() self.store.clear() results = self.index.search(unicode(query)) for i, fields in enumerate(results): title = fields['title'] if(title == ''): title = os.path.basename(fields['path']); self.store.append([ title, fields['path'], self.get_icon(gtk.STOCK_FILE), 'jpg']) self.start_thumbnailer() def browse(self,folder): """ Browse the given folder folder needs to exist and be relative to the library root, otherwise a NoDirException is thrown """ self.stop_thumbnailer() self.store.clear() folder = folder.replace('..','') full = os.path.join(self.root,folder) full = os.path.abspath(full) imgre = re.compile('\.jpe?g$',re.IGNORECASE) if(not os.path.isdir(full)): raise NoDirException("No such directory in library: "+folder) # add upper dir if(folder): upper = os.path.dirname(folder); self.store.append([ '..', upper, self.get_icon(gtk.STOCK_GO_UP), 'dir']) for fl in os.listdir(full): if fl[0] == '.': continue; #skip hidden files fn = os.path.join(full,fl) rel = os.path.relpath(fn,self.root) title = os.path.basename(fn) if(os.path.isdir(fn)): self.store.append([ title, rel, self.get_icon(gtk.STOCK_DIRECTORY), 'dir']) elif(imgre.search(fn)): self.store.append([ title, rel, self.get_icon(gtk.STOCK_FILE), 'jpg']) self.start_thumbnailer() def start_thumbnailer(self): """ Start thumbnailing for the current ListStore Thumbnailing is done in a separate thread """ self.stop_thumbnailer() self.thumbnailer = GeneratorTask(self._create_thumbnails) self.thumbnailer.start() def stop_thumbnailer(self): """ Stop any running thumbnailer Always call this before the ListStore is cleared! """ if self.thumbnailer is not None: self.thumbnailer.stop() self.thumbnailer.wait() self.thumbnailer = None def _create_thumbnails(self): """ The thumbnailing process FIXME: reading addtional image info from exif might be sensible here """ for row in self.store: path = row[self.COL_PATH] ftype = row[self.COL_TYPE] fn = os.path.join(self.root,path) if(ftype == 'jpg'): buf = gtk.gdk.pixbuf_new_from_file_at_size(fn, 48, 48) row[self.COL_PIXBUF] = buf yield None def get_itemat(self,pos): """ Return the item at the given postion """ row = self.store[pos]; path = row[self.COL_PATH] ftype = row[self.COL_TYPE] fn = os.path.join(self.root,path) return {'fn':fn, 'ft': ftype} def get_nextimagepos(self,pos): """ Get the position of the next image (not dir) after the given position. If the given positon is None, the search sats at the beginning of the store FIXME there is probably a much more elegant way doing the whole iteration stuff, but I can't figure it out """ if(pos == None): pos = 0; else: pos = pos+1; try: rowiter = self.store.get_iter(pos); while rowiter != None: if(self.store.get_value(rowiter,self.COL_TYPE) == 'jpg'): return self.store.get_path(rowiter)[0]; self.store.iter_next(rowiter); except ValueError: # we're out of range pass return None def get_previmagepos(self,pos): """ Get the position of the next image (not dir) before the given position. FIXME there is probably a much more elegant way. 
And I have no idea how to iterate backwards anyway """ while (pos >=0): pos -= 1; try: rowiter = self.store.get_iter(pos); if(self.store.get_value(rowiter,self.COL_TYPE) == 'jpg'): return self.store.get_path(rowiter)[0]; except ValueError: # we're out of range pass return None def get_icon(self, name): """ Helper to load a stock icon """ theme = gtk.icon_theme_get_default() return theme.load_icon(name, 48, 0) def get_tagcloudstring(self): tags = self.index.tagcloud() cloud = '' for tag in sorted(tags): # style = 'size="%d" underline="none" foreground="blck"' % (5 + (tags[tag]*10)) # cloud += '<a href="tags:'+urllib.quote('"'+tag+'"')+'" underline="none" foreground="black">'+cgi.escape(tag)+'</a> '; cloud += '<a href="%s"><span size="%d" underline="none" foreground="black">%s</span></a> ' % ( urllib.quote('"'+tag+'"'), (10 + (tags[tag]*5))*1000, cgi.escape(tag) ) return cloud