def _parse_general_search(self, search_results):
    '''Finds the item to download from meta-search results'''
    parsed_results = {}
    bad_file_types = ['flac', 'wma']
    for item in search_results('item'):
        description = str(item.description)
        # skip past the 'Seeds: ' label (7 characters) to the count itself
        seeds_index = description.index('Seeds') + 7
        cut_description = description[seeds_index:]
        space_index = cut_description.index(' ')
        seeds = cut_description[:space_index]
        seeds = seeds.replace(",", "")
        seeds = int(seeds)
        file_type = str(item.category).lower()
        if file_type not in bad_file_types and seeds >= 5:
            title = item.title
            title = remove_html_tags(str(title))
            title = remove_entities(title)
            # guid is the url of the 'choose a tracker'
            # page on torrentz
            guid = item.guid
            guid = remove_html_tags(str(guid))
            parsed_results[title] = guid
    if not parsed_results:
        raise DownloaderError('No valid results for search term')
    return parsed_results
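
# The method above depends on a few helpers defined elsewhere in the project.
# A minimal sketch of plausible implementations, assuming the feed's markup is
# simple and flat; the real definitions may differ.
import re

class DownloaderError(Exception):
    '''Raised when a search yields no downloadable results.'''

def remove_html_tags(text):
    # strip anything that looks like a tag; enough for the feed's flat markup
    return re.sub(r'<[^>]+>', '', text)

def remove_entities(text):
    # drop named or numeric entities such as &amp; or &#39; (assumed behaviour)
    return re.sub(r'&#?\w+;', '', text)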
def generate_documents():
    events = r.keys('event:*:title')
    for event_key in events:
        event_id = id(event_key)
        lang = r.get('event:' + event_id + ':lang')
        docs = r.keys('document:*:' + event_id)
        documents[event_id] = []
        for doc_key in docs:
            doc_id = id(doc_key)
            tweet_ids = r.lrange('document:' + doc_id + ':tweets', 0, -1)
            document = []
            for tweet_id in tweet_ids:
                # this could be improved...
                tweet = utils.remove_entities(tweet_id)
                tweet = parser.unescape(' '.join(tweet.split()))
                if not tweet.split():  # skip empty or whitespace-only tweets
                    continue
                tweet = utils.strip_accents(tweet)
                tweet = utils.remove_stopwords(tweet, lang)
                tweet = ' '.join([stemmers[lang].stem(token)
                                  for token in tweet.split()])
                document.append(tweet)
            documents[event_id].append(' '.join(document))
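
# generate_documents above and generate_documents_for below lean on
# module-level state defined elsewhere in the project. A sketch of what that
# setup presumably looks like; the Redis parameters, stemmer languages and the
# body of the id() helper are assumptions, not part of the original code.
import redis
import utils                              # project module: remove_entities,
                                          # strip_accents, remove_stopwords
from urlparse import urlparse             # urllib.parse on Python 3
from HTMLParser import HTMLParser         # html.parser on Python 3
from nltk.stem.snowball import SnowballStemmer

r = redis.StrictRedis(host='localhost', port=6379, db=0)
parser = HTMLParser()                     # supplies parser.unescape()
stemmers = {'spanish': SnowballStemmer('spanish'),
            'english': SnowballStemmer('english')}
documents = {}
documents_ids = {}
documents_real_ids = []

def id(key):
    # presumed project helper: pull the middle segment out of keys shaped
    # like 'event:123:title' or 'document:456:tweets'; shadows the builtin
    return key.split(':')[1]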
def generate_documents_for(event_id):
    lang = r.get('event:' + event_id + ':lang')
    if lang is None:
        lang = 'spanish'
    docs = r.keys('document:*:' + event_id)
    documents[event_id] = []
    documents_ids[event_id] = []
    keys = []
    for eid in docs:
        keys.append(id(eid))
    docs = set(keys)  # deduplicate document ids
    for doc_id in docs:
        # fb could not be resolved, and many documents end up pointing at
        # unsupportedbrowser; fb is ignored until this problem is fixed
        url = r.get('document:%s:url' % doc_id)
        if urlparse(url).netloc == 'www.facebook.com':
            continue
        documents_real_ids.append(doc_id)
        tweet_ids = r.lrange('document:' + doc_id + ':tweets', 0, -1)
        documents_ids[event_id].append(tweet_ids)
        document = []
        for tweet_id in tweet_ids:
            # this could be improved...
            tweet = utils.remove_entities(tweet_id)
            tweet = parser.unescape(' '.join(tweet.split()))
            if not tweet.split():  # skip empty or whitespace-only tweets
                continue
            tweet = utils.strip_accents(tweet)
            tweet = utils.remove_stopwords(tweet, lang)
            tweet = ' '.join([stemmers[lang].stem(token)
                              for token in tweet.split()])
            document.append(tweet)
        documents[event_id].append(' '.join(document))
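
# A hypothetical invocation, assuming Redis already holds keys of the form
# 'event:<id>:lang', 'document:<id>:url' and 'document:<id>:tweets'; '42' is
# an illustrative event id, not one taken from the original data.
generate_documents_for('42')
print(documents['42'])  # one preprocessed, stemmed string per document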