Example #1
def process_dataset(redis):
    import guess_language
    # Keys follow the 'page:<id>:raw_content' convention.
    pages = redis.keys('page:*:raw_content')

    for page in pages:
        page_id = page.split(':')[1]
        extracted = redis.get('page:%s:extracted' % page_id)

        # Skip pages that were already processed.
        if extracted is None:
            html = redis.get(page)
            if html is not None and html != '':
                try:
                    content = extract_content(html)
                    lang = guess_language.guessLanguageName(content).lower()
                    try:
                        print tag, "extracting", page_id
                        content = process_content(content, lang)
                    except Exception, e:
                        # Fall back to English when processing the detected
                        # language fails.
                        print tag, e
                        content = process_content(content, 'english')

                    redis.set('page:%s:extracted' % page_id, 1)
                    redis.set('page:%s:content' % page_id, content)
                except Exception:
                    pass
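The snippet assumes a 'page:<id>:raw_content' key layout in Redis and relies on extract_content, process_content and tag being defined elsewhere in its module. A minimal, hypothetical driver sketch, assuming a local Redis server and the redis-py client; the helpers below are placeholders, not the original implementations:

import redis

def extract_content(html):
    return html  # placeholder for the real HTML-to-text extractor

def process_content(content, lang):
    return content  # placeholder for the real per-language processing

tag = '[process_dataset]'  # stand-in for the module-level tag the snippet prints

r = redis.StrictRedis(host='localhost', port=6379, db=0)
r.set('page:42:raw_content', '<html><body>Hello world</body></html>')
process_dataset(r)
print r.get('page:42:content')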
Example #3
def filter_geo_location(picture_data):
    file_name = picture_data['file_name']
    picture = picture_data['picture']
    location = None
    # If it contains geo-location
    if 'place' in picture['data']:
        if picture['data']['place'] is not None:
            # topLanguageName, topLanguageCode, isReliable, textBytesFound, details = chronium_compact_language_detector.detect(picture['data']['description'])
            pin_language = guess_language.guessLanguageName(
                picture_data['board_description'])
            if pin_language == "Afrikaans" or pin_language == "Latin" or pin_language == "Catalan":
                pin_language = "UNKNOWN"
            # extract features we want:
            location = {
                'longitude': picture['data']['place']['longitude'],
                'latitude': picture['data']['place']['latitude'],
                'description': picture['data']['description'],
                'user_name': picture['data']['pinner']['username'],
                'user_full_name': picture['data']['pinner']['username'],
                'user_id': picture['data']['pinner']['id'],
                'city_name': picture['data']['place']['locality'],
                'country_name': picture['data']['place']['country'],
                'foursquare_category': picture['data']['place']['category'],
                'repin_count': picture['data']['repin_count'],
                'is_repin': picture['data']['is_repin'],
                'board_url': picture['data']['board']['url'],
                'pin_url': picture['data']['link'],
                'date': picture['data']['created_at'],
                'board_follower_count': picture_data['board_follower_count'],
                'board_description': picture_data['board_description'],
                'file_name': file_name,
                'pin_language': pin_language
            }
    return location
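The nested lookups above spell out the pin payload the function expects. A hypothetical input built from exactly those fields (values invented for illustration, assuming guess_language is imported as in the snippet):

sample = {
    'file_name': 'pin_0001.jpg',
    'board_follower_count': 120,
    'board_description': 'Places to visit in France',
    'picture': {
        'data': {
            'place': {
                'longitude': 2.3522,
                'latitude': 48.8566,
                'locality': 'Paris',
                'country': 'France',
                'category': 'Plaza',
            },
            'description': 'Tour Eiffel at night',
            'pinner': {'username': 'traveller', 'id': '123'},
            'repin_count': 7,
            'is_repin': False,
            'board': {'url': '/traveller/france/'},
            'link': 'http://example.com/pin/1',
            'created_at': '2014-05-01T12:00:00',
        }
    }
}

location = filter_geo_location(sample)
print location['city_name'], location['pin_language']  # should print something like: Paris English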
Example #4
    def save_page(content, url_expanded, params):
        page_id = params[0]
        pipe = params[1]

        # Queue the raw page and its expanded URL on the Redis pipeline.
        rkey = '%s:%s:raw_content' % ('page', page_id)
        pipe.set(rkey, content.decode('utf-8', errors='ignore'))

        rkey = '%s:%s:expanded_url' % ('page', page_id)
        pipe.set(rkey, url_expanded)

        if content is not None and content != '':
            try:
                content = extract_content(content)
                lang = guess_language.guessLanguageName(content).lower()
                try:
                    content = process_content(content, lang)
                except Exception:
                    # Fall back to English when the detected language fails.
                    content = process_content(content, 'english')

                pipe.set('page:%s:extracted' % page_id, 1)
                pipe.set('page:%s:content' % page_id, content)
            except Exception:
                pass

        print F, "saved page", url_expanded
def checkLanguage(msg):
    guess = guess_language.guessLanguageName(msg)
    if guess == "UNKNOWN":
        # Default to English when detection fails.
        return "En"
    # Keep the first two letters of the language name (e.g. "French" -> "Fr").
    return guess[:2]
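Note that the two-letter result is the prefix of the English language name, not an ISO code. Per the test data further down, for example:

print checkLanguage("Verifions que le détecteur de langues marche")  # Fr (from "French")
print checkLanguage("ii")                                            # En (UNKNOWN fallback)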
def messageinfo(message):
    ret = ""
    ret = getmessagetext(message) + "\n\n"
    language = "language" + SEP + guess_language.guessLanguageName(ret)
    for (mark, placeholder) in [(",", "comma"), (".", "full_stop"), ("!", "exclaimationmark"), ("?", "questionmark")]:
        ret = ret.replace(mark, mark + " " + SEP + placeholder + " ")
    ret += language + "\n"
    for header in ["subject"]:  # Headers, that are also content
        ret = ret.rstrip() + "\n"
        for instance in getheaders(message, header):
            ret += instance + " "
            for word in words.findall(instance):
                ret += header + SEP + word + " "
                ret += word + " "
    ret = ret.rstrip() + "\n"
    headerinfo = set()
    for header in message.keys():
        headerinfo.add("hasheader" + SEP + header.replace(".", "_").replace("-", "_"))
    for header in [
        "sender",
        "to",
        "cc",
        "x-mailer",
        "from",
        "importance",
        "precedence",
        "List-Id",
    ]:  # ,'sender','to','cc','bcc']:
        for instance in getheaders(message, header):
            instance += " " + instance.replace("@", "_").replace(".", "__")
            if header.startswith("x-"):
                header = header[2:]
            for word in words.findall(instance):
                if sum(c.isalpha() for c in word) > (len(word) / 3 * 2):
                    headerinfo.add(header + SEP + word)
    receivedheaders = "\n".join(getheaders(message, "received"))
    if getsenderip(receivedheaders):
        headerinfo.add("from_ip" + SEP + getsenderip(receivedheaders).replace(".", "_"))
    for k, v in getsenderlocation(receivedheaders).iteritems():
        if v:
            headerinfo.add(u"from_location_" + k + SEP + v.decode("utf-8").replace(" ", "_"))
    gender = getsendergender("\n".join(getheaders(message, "from")))
    headerinfo.add("from_gender" + SEP + str(gender))
    for contenttype in getcontenttypes(message):
        headerinfo.add("contains" + SEP + contenttype.replace("/", "_"))
    return ret + "\n" + " ".join(headerinfo)
    def test_guess(self):
        tests = [
            ("This is a test of the language checker", "en"),
            ("Verifions que le détecteur de langues marche", "fr"),
            ("Sprawdźmy, czy odgadywacz języków pracuje", "pl"),
            ("авай проверить  узнает ли наш угадатель русски язык", "ru"),
            ("La respuesta de los acreedores a la oferta argentina para salir del default no ha sido muy positiv", "es"),
             ("Сайлау нәтижесінде дауыстардың басым бөлігін ел премьер министрі Виктор Янукович пен оның қарсыласы, оппозиция жетекшісі Виктор Ющенко алды.", "kk"), # Kazakh
            ("милиция ва уч солиқ идораси ходимлари яраланган. Шаҳарда хавфсизлик чоралари кучайтирилган.", "uz"), # uzbek
            ("көрбөгөндөй элдик толкундоо болуп, Кокон шаарынын көчөлөрүндө бир нече миң киши нааразылык билдирди.", "ky"), # kyrgyz
            ("yakın tarihin en çekişmeli başkanlık seçiminde oy verme işlemi sürerken, katılımda rekor bekleniyor.", "tr"),
             ("Daxil olan xəbərlərdə deyilir ki, 6 nəfər Bağdadın mərkəzində yerləşən Təhsil Nazirliyinin binası yaxınlığında baş vermiş partlayış zamanı həlak olub.", "az"), # Azerbaijani

             (" ملايين الناخبين الأمريكيين يدلون بأصواتهم وسط إقبال قياسي على انتخابات هي الأشد تنافسا منذ عقود",  "ar"),
             ("Американське суспільство, поділене суперечностями, збирається взяти активну участь у голосуванні",  "uk"), # ukrainian
             ("Francouzský ministr financí zmírnil výhrady vůči nízkým firemním daním v nových členských státech EU",  "cs"), # czech
             ("biće prilično izjednačena, sugerišu najnovije ankete. Oba kandidata tvrde da su sposobni da dobiju rat protiv terorizma",  "hr"), # croatian
             (" е готов да даде гаранции, че няма да прави ядрено оръжие, ако му се разреши мирна атомна програма",  "bg"), # bulgarian
             ("на јавното мислење покажуваат дека трката е толку тесна, што се очекува двајцата соперници да ја прекршат традицијата и да се појават и на самиот изборен ден.",  "mk"), # macedonian
             ("în acest sens aparţinînd Adunării Generale a organizaţiei, în ciuda faptului că mai multe dintre solicitările organizaţiei privind organizarea scrutinului nu au fost soluţionate",  "ro"), # romanian
             ("kaluan ditën e fundit të fushatës në shtetet kryesore për të siguruar sa më shumë votues.",  "sq"), # albanian
             ("αναμένεται να σπάσουν παράδοση δεκαετιών και να συνεχίσουν την εκστρατεία τους ακόμη και τη μέρα των εκλογών",  "el"), # greek
             (" 美国各州选民今天开始正式投票。据信,",  "zh"), # chinese
             (" Die kritiek was volgens hem bitter hard nodig, omdat Nederland binnen een paar jaar in een soort Belfast zou dreigen te veranderen",  "nl"), # dutch
             ("På denne side bringer vi billeder fra de mange forskellige forberedelser til arrangementet, efterhånden som vi får dem ",  "da"), # danish
             ("Vi säger att Frälsningen är en gåva till alla, fritt och för intet.  Men som vi nämnt så finns det två villkor som måste",  "sv"), # swedish
             ("Nominasjonskomiteen i Akershus KrF har skviset ut Einar Holstad fra stortingslisten. Ytre Enebakk-mannen har plass p Stortinget s lenge Valgerd Svarstad Haugland sitter i",  "nb"), # norwegian
             ("on julkishallinnon verkkopalveluiden yhteinen osoite. Kansalaisten arkielämää helpottavaa tietoa on koottu eri aihealueisiin",  "fi"), # finnish
             ("Ennetamaks reisil ebameeldivaid vahejuhtumeid vii end kurssi reisidokumentide ja viisade reeglitega ning muu praktilise informatsiooniga",  "et"), # estonian
             ("Hiába jön létre az önkéntes magyar haderő, hiába nem lesz többé bevonulás, változatlanul fennmarad a hadkötelezettség intézménye",  "hu"), # hungarian
             ("հարաբերական",  "hy"), # armenian
             ("Hai vấn đề khó chịu với màn hình thường gặp nhất khi bạn dùng laptop là vết trầy xước và điểm chết. Sau đây là vài cách xử lý chú", "vi"),
             ("ii",  UNKNOWN),

             # This text has a mix of Hirigana, Katakana and CJK which requires the fix for issue:3 to classify correctly
             ("トヨタ自動車、フィリピンの植林活動で第三者認証取得 トヨタ自動車(株)(以下、トヨタ)は、2007年9月よりフィリピンのルソン島北部に位置するカガヤン州ペニャブラン", 'ja'),
        ]

        for text, name in tests:
            self.assertEquals(name, guessLanguage(text))

        text = "Verifions que le détecteur de langues marche"
        self.assertEquals('fr', guessLanguageTag(text))
        self.assertEquals('French', guessLanguageName(text))
        self.assertEquals(26150, guessLanguageId(text))
        self.assertEquals(('fr', 26150, 'French'), guessLanguageInfo(text))
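For quick reference, the closing assertions document the library's entry points side by side; on the same sentence they return (values copied from the test above):

text = "Verifions que le détecteur de langues marche"
print guessLanguage(text)      # 'fr'
print guessLanguageTag(text)   # 'fr'
print guessLanguageName(text)  # 'French'
print guessLanguageId(text)    # 26150
print guessLanguageInfo(text)  # ('fr', 26150, 'French')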
Example #8
def process(text):
    """
    This is somewhat of a complicated beast. It does all pre-processing for a chunk of text, including:
    
    * cleaning the string
    * Language detection
    * Tokenizing
    * Application of a stop-list (in English or Spanish)
    * Stemming (snowball in English or Spanish)
    
    Args:
    text : (string or clean Unicode) text to be processed
    
    Returns:
    tokens: a list of (token, sentiment) where token is a stemmed word, and sentiment is the explicit sentiment for the token (if found in the corpus)
    
    """

    try:
        lang = gl.guessLanguageName(text).lower()
        #print lang
    except:
        return []

    tokens = word_tokenize(strip_punctuation(text.lower()))
    out_tokens = []

    if lang == 'english':
        stoplist = english_stoplist.stoplist
        stemmer = SnowballStemmer('english')
    elif lang == 'spanish':
        stoplist = spanish_stoplist.stoplist
        stemmer = SnowballStemmer('spanish')
    else:
        return []

    for token in tokens:
        if (token not in stoplist) and (not token.startswith('http')) and (len(token) > 3):
            tt = stemmer.stem(token)
            out_tokens.append(tt)  # , senti.sentiment_token(tt, language=lang)))

    return out_tokens
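A sketch of how process might be driven, assuming NLTK (with its punkt tokenizer data) and guess-language are installed; strip_punctuation and the stop-list modules are snippet-local names, so hypothetical stand-ins are provided here:

import string
import guess_language as gl
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

def strip_punctuation(s):
    # stand-in for the snippet's helper
    return s.translate(None, string.punctuation)

class _Stoplist(object):
    # stand-in for the english_stoplist / spanish_stoplist modules
    stoplist = set(['the', 'and', 'were', 'this'])

english_stoplist = spanish_stoplist = _Stoplist

print process("The running dogs were chasing the bouncing balls")
# should print something like ['run', 'dog', 'chase', 'bounc', 'ball']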
Example #9
def process(text):
    """
    This is somewhat of a complicated beast. It does all pre-processing for a chunk of text, including:
    
    * cleaning the string
    * Language detection
    * Tokenizing
    * Application of a stop-list (in English or Spanish)
    * Stemming (snowball in English or Spanish)
    
    Args:
    text : (string or clean Unicode) text to be processed
    
    Returns:
    tokens: a list of (token, sentiment) where token is a stemmed word, and sentiment is the explicit sentiment for the token (if found in the corpus)
    
    """
    
    
    try:
        lang = gl.guessLanguageName(text).lower()
        #print lang
    except:
        return []

    tokens = word_tokenize(strip_punctuation(text.lower()))
    out_tokens = []

    if lang == 'english':
        stoplist = english_stoplist.stoplist
        stemmer = SnowballStemmer('english')
    elif lang == 'spanish':
        stoplist = spanish_stoplist.stoplist
        stemmer = SnowballStemmer('spanish')
    else:
        return []

    for token in tokens:
        if (token not in stoplist) and (not token.startswith('http')) and (len(token) > 3):
            tt = stemmer.stem(token)
            out_tokens.append((tt, senti.sentiment_token(tt, language=lang)))

    return out_tokens
def process(text):
    try:
        lang = gl.guessLanguageName(text)
        #print lang
    except:
        return []

    ## only keep the English tweets
    if lang != 'English':
        return []
    else:
        tokens = []
        for token in text.lower().split(' '):
            try:
                token = strip_punctuation(token).lower()
            except TypeError:
                pass

            if (token not in stoplist) and (not token.startswith('@')):
                tokens.append(stem.stem(token))

        return tokens
def messageinfo(message):
    ret = ''
    ret = getmessagetext(message) + '\n\n'
    if USELANG:
        language = 'language' +SEP+ guess_language.guessLanguageName(ret)
    for (mark,placeholder) in [(',','comma'),('.','full_stop'),('!','exclaimationmark'),('?','questionmark')]:
        ret = ret.replace(mark, mark+' '+SEP + placeholder+' ')
    if USELANG:
        ret += language + '\n'
    for header in ['subject']: # Headers, that are also content
        ret = ret.rstrip() + '\n'
        for instance in getheaders(message,header):
            ret += instance + ' '
            for word in words.findall(instance):
                ret += header + SEP + word +' '
                ret += word +' '
    ret = ret.rstrip() + '\n'
    headerinfo = set()
    for header in message.keys():
        headerinfo.add('hasheader'+ SEP + header.replace('.','_').replace('-','_'))
    for header in ['sender','to','cc','x-mailer','from','importance','precedence','List-Id']: # ,'sender','to','cc','bcc']:
        for instance in getheaders(message,header):
            instance += ' '+ instance.replace('@','_').replace('.','__')
            if header.startswith('x-'): header = header[2:]
            for word in words.findall(instance):
                if sum(c.isalpha() for c in word) > (len(word)/3*2):
                    headerinfo.add(header + SEP + word)
    receivedheaders = '\n'.join(getheaders(message,'received'))
    if getsenderip(receivedheaders):
        headerinfo.add('from_ip'+ SEP + getsenderip(receivedheaders).replace('.','_'))
    for k,v in getsenderlocation(receivedheaders).iteritems():
        if v:
            headerinfo.add(u'from_location_'+k+ SEP + v.decode('utf-8').replace(' ','_'))
    gender = getsendergender('\n'.join(getheaders(message,'from')))
    headerinfo.add('from_gender' +SEP +str(gender))
    for contenttype in getcontenttypes(message):
        headerinfo.add('contains' + SEP + contenttype.replace('/','_'))
    return ret+'\n'+' '.join(headerinfo)
Example #13
                html = open(webpage).read()
                soup = BeautifulSoup(html)

                links = soup.findAll('a', href=True)
                for tag in links:
                    link = tag.get('href', None)
                    if link is not None:
                        urls.append(link)
                
                for script in soup(["script", "style"]):
                    script.extract()
                text = soup.getText()
                lines = (line.strip() for line in text.splitlines())
                chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
                text = '\n'.join(chunk for chunk in chunks if chunk)
                language = guess_language.guessLanguageName(text.encode('utf8'))
                languages.append(language)
                size = os.path.getsize(webpage)
                words = text.encode('utf8').split()
                
                report = open('%s%s' % (str(timestamp), '-hard_metrics.txt'), 'a')
                report.write(webpage)
                report.write('%s%s' % ('\nLanguage: ', language))
                report.write('%s%s' % ('\nNumber of links: ', str(len(urls))))
                report.write('%s%s' % ('\nSize of file: ', str(size)))
                report.write('%s%s' % ('\nAmount of words: ', str(len(words))))
                report.write('\n==============================================================\n')
                report.close()
                sizes.append(size)
                
                if len(urls)>most:
Example #14
    [args.source_dir_data, os.sep, '*.dat']))
num_files = len(author_filenames)  # processing feedback
i = 0  # processing feedback
for author_filename in author_filenames:
    sys.stdout.write(''.join(
        ['\t', str(i), '/', str(num_files), ' files processed\r']))
    i += 1
    logging.debug(''.join(['Processing ', author_filename, ' file ...']))
    messages = messages_persistence.read(author_filename)
    messages_filtered = []
    for message in messages:
        logging.debug(''.join(
            ['Detecting language for tweet: ', message['tweet']]))
        try:  # guess-language breaks for some tweets
            detected_language = guess_language.guessLanguageName(
                message['tweet'])
        except Exception as e:
            logging.warning(
                'guess-language library error in detecting language for tweet: '
                + message['tweet'])
            logging.warning('Exception message: ' + str(e))
            logging.warning('Exception stack trace:')
            traceback.print_tb(sys.exc_info()[2])
            detected_language = None
        if detected_language:
            logging.debug(''.join(
                ['\tLanguage \'', detected_language, '\' detected.']))
            if detected_language == args.language:
                messages_filtered.append(message)
        else:
            logging.warning('No language detected for tweet: ' +
Example #15
def detect_lang(text):
    '''
    @param text: String to detect its natural language
    '''
    return gl.guessLanguageName(text)
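A one-line usage sketch; per the test suite above, guess-language should name this sentence Polish:

print detect_lang(u"Sprawdźmy, czy odgadywacz języków pracuje")  # 'Polish'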
    
Example #16
def import_from_jigsaw_root(root, user, project):
    order = 1
    for node in root:
        id = node.findtext("docID")
        text = node.findtext("docText")
        #text.tag = 'div'
        docs = list(Document.objects.filter(import_id=id))
        if len(docs) == 0:
            d = Document(creator=user, last_updater=user, import_id=id,
                         description=id, ordering=order, project=project)
            d.save()
            t = Transcript(creator=user, last_updater=user, document=d, content=text)
            t.save()
        else:
            d = docs[0]
            d.last_updater = user
            d.description = id
            d.ordering = order  # the field is named 'ordering', as in the constructor above
        d.language = gl.guessLanguageName(text)
        order += 1
        d.save()

        date = node.find("docDate")
        if date is not None and date.text:
            try:
                dt = datetime.strptime(date.text, "%m/%d/%Y")
                res = list(Topic.objects.filter(date=dt))
                if len(res) == 0:
                    normalized = "Date: %d/%d/%d" % (dt.year, dt.month, dt.day)
                    t = get_or_create_topic(user, normalized, 'EVT', project, dt)
                    t.date = dt
                    #t = Topic(creator=user, last_updater=user, preferred_name=normalized, date=dt, type='EVT', project=project)
                    t.save()
                else:
                    t = res[0]
                d.related_topics.create(creator=user, topic=t)
            except ValueError as e:
                pass
        for p in node.findall("concept"):
            name=p.text
            t=get_or_create_topic(user, name, 'TAG',project)
            if t:
        	d.related_topics.create(creator=user, topic=t)
            else:
                print "Cannot create topic(%s,type=%s)" % (t,'TAG')
            
        for p in node.findall("person"):
            name=p.text
            t=get_or_create_topic(user, name, 'PER',project)
            if t:
                d.related_topics.create(creator=user, topic=t)
            else:
                print "Cannot create topic(%s,type=%s)" % (t,'PER')
        for p in node.findall("location"):
            name=p.text
            t=get_or_create_topic(user, name, 'PLA',project)
            if t:
                d.related_topics.create(creator=user, topic=t)
            else:
                print "Cannot create topic(%s,type=%s)" % (t,'PLA')
Example #17
    def test_guess_language(self):
        with open("input.txt", 'r') as textfile:
            print "guess_language: "
            for text in textfile:
                print gl.guessLanguageName(text)
            print