Example 1
def _parse_general_search(self, search_results):
    '''Finds the item to download from meta-search results'''
    parsed_results = {}
    for item in search_results('item'):
        # The description carries the seed count; jumping 7 characters past
        # the start of 'Seeds' (presumably the 'Seeds: ' label) lands on the
        # number, which runs up to the next space and may contain commas.
        description = str(item.description)
        seeds_index = description.index('Seeds') + 7
        cut_description = description[seeds_index:]
        space_index = cut_description.index(' ')
        seeds = int(cut_description[:space_index].replace(",", ""))
        file_type = str(item.category).lower()
        bad_file_types = ['flac', 'wma']
        if file_type not in bad_file_types and seeds >= 5:
            title = remove_html_tags(str(item.title))
            title = remove_entities(title)
            # guid is the URL of the 'choose a tracker' page on torrentz
            guid = remove_html_tags(str(item.guid))
            parsed_results[title] = guid
    if not parsed_results:
        raise DownloaderError('No valid results for search term')
    return parsed_results
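The snippet leans on two project helpers, remove_html_tags and remove_entities, whose implementations are not shown. A minimal sketch of what they might look like (assumed, not the project's actual code):

import re
from html import unescape  # Python 3; the original project may target Python 2

def remove_html_tags(text):
    """Strip anything that looks like an HTML/XML tag."""
    return re.sub(r'<[^>]+>', '', text)

def remove_entities(text):
    """Decode HTML entities such as &amp; or &#39; into plain characters."""
    return unescape(text)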
Example 3
def generate_documents():
    # Builds the module-level 'documents' mapping: one normalized, concatenated
    # text per document, grouped by event. 'r' is the module-level Redis client;
    # 'id' shadows the builtin and is presumably a project helper that extracts
    # the numeric identifier from a key.
    events = r.keys('event:*:title')
    for event_key in events:
        event_id = id(event_key)
        lang = r.get('event:' + event_id + ':lang')
        docs = r.keys('document:*:' + event_id)
        documents[event_id] = []
        for doc_key in docs:
            doc_id = id(doc_key)
            tweet_ids = r.lrange('document:' + doc_id + ':tweets', 0, -1)
            document = []
            for tweet_id in tweet_ids:
                # this could be improved...
                tweet = utils.remove_entities(tweet_id)
                tweet = parser.unescape(' '.join(tweet.split()))
                if len(tweet) == 0 or len(tweet.split()) == 0:
                    continue
                tweet = utils.strip_accents(tweet)
                tweet = utils.remove_stopwords(tweet, lang)
                tweet = ' '.join([stemmers[lang].stem(token) for token in tweet.split()])
                document.append(tweet)
            documents[event_id].append(' '.join(document))
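Both Redis-based examples call id() on keys such as 'event:123:title'. That cannot be the builtin id() (which returns an object's memory address), so it is presumably a project helper that pulls the identifier out of the key. A minimal sketch under that assumption:

def id(key):
    """Assumed helper: extract the identifier segment of a Redis key,
    e.g. 'event:123:title' -> '123' or 'document:42:123' -> '42'."""
    return key.split(':')[1]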
Example 4
def generate_documents_for(event_id):
    # Like generate_documents(), but for a single event. Populates the
    # module-level 'documents', 'documents_ids' and 'documents_real_ids'
    # containers; 'r' is the module-level Redis client.
    lang = r.get('event:' + event_id + ':lang')
    if lang is None:
        lang = 'spanish'
    docs = r.keys('document:*:' + event_id)
    documents[event_id] = []
    documents_ids[event_id] = []

    # Extract the document ids and deduplicate them.
    keys = []
    for eid in docs:
        keys.append(id(eid))

    docs = set(keys)
    for doc_id in docs:
        #doc_id = id(doc_key)

        # Facebook links could not be resolved, so many documents end up
        # pointing at an 'unsupportedbrowser' page; Facebook is skipped
        # until this problem is fixed.
        url = r.get('document:%s:url' % doc_id)
        if urlparse(url).netloc == 'www.facebook.com':
            continue

        documents_real_ids.append(doc_id)
        tweet_ids = r.lrange('document:' + doc_id + ':tweets', 0, -1)
        documents_ids[event_id].append(tweet_ids)

        document = []
        for tweet_id in tweet_ids:
            # this could be improved...
            tweet = utils.remove_entities(tweet_id)
            tweet = parser.unescape(' '.join(tweet.split()))
            if len(tweet) == 0 or len(tweet.split()) == 0:
                continue
            tweet = utils.strip_accents(tweet)
            tweet = utils.remove_stopwords(tweet, lang)
            tweet = ' '.join([stemmers[lang].stem(token) for token in tweet.split()])
            document.append(tweet)
        documents[event_id].append(' '.join(document))
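These two functions depend on module-level objects the excerpts do not show: the Redis client r, the documents/documents_ids/documents_real_ids containers, a parser exposing an unescape method, and a stemmers map keyed by language. A possible setup, assuming redis-py, NLTK's SnowballStemmer, and Python 3 (the original project may well target Python 2, where urlparse and HTMLParser.unescape lived in different modules):

import html
import redis
from urllib.parse import urlparse  # Python 2: from urlparse import urlparse
from nltk.stem.snowball import SnowballStemmer

# decode_responses=True so Redis returns str and the key concatenation works
r = redis.StrictRedis(decode_responses=True)

documents = {}           # event_id -> list of normalized document texts
documents_ids = {}       # event_id -> list of tweet-id lists, one per document
documents_real_ids = []  # ids of the documents actually processed

stemmers = {
    'spanish': SnowballStemmer('spanish'),
    'english': SnowballStemmer('english'),
}

class _Parser:
    # stand-in for the HTML parser object used as 'parser' above
    def unescape(self, text):
        return html.unescape(text)

parser = _Parser()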