def test_detect_language_mixed_languages(self):
        # The sample mixes a Spanish, a German and an English passage; the
        # detector is expected to report 'English' for the combined text.
        mixed_sample = """
            # spanish
            Lionel Andrés Messi Cuccittini (Rosario, 24 de junio de 1987),
            conocido como Leo Messi, es un futbolista argentino11 que juega
            como delantero en el Fútbol Club Barcelona y en la selección
            argentina, de la que es capitán.

            # german
            Messi spielt seit seinem 14. Lebensjahr für den FC Barcelona.
            Mit 24 Jahren wurde er Rekordtorschütze des FC Barcelona, mit 25
            der jüngste Spieler in der La-Liga-Geschichte, der 200 Tore
            erzielte.
            
            # english
            Four score and seven years ago our fathers brought forth on this 
            continent, a new nation, conceived in Liberty, and dedicated to the 
            proposition that all men are created equal. Now we are engaged in a great 
            civil war, testing whether that nation, or any nation so conceived and 
            so dedicated, can long endure. We are met on a great battle-field of that war. 
            We have come to dedicate a portion of that field, as a final resting 
            place for those who here gave their lives that that nation might live.
            
        """
        detected = detect_language(mixed_sample, self.languages)
        self.assertEqual(detected, 'English')
Esempio n. 2
0
def get_name_fields(layer,
                    fields,
                    field_types,
                    min_completeness=0.8,
                    min_uniqueness=0.8):
    """Return the fields of ``layer`` that look like name columns.

    A field qualifies when all of the following hold:
      * its lower-cased name does not contain "pc",
      * its type is "OFTString",
      * the share of truthy values exceeds ``min_completeness``,
      * the share of distinct values exceeds ``min_uniqueness``,
      * ``isGibberish`` rejects and ``isNameList`` accepts its value set,
      * it has more than 10 distinct values.

    Args:
        layer: object handed to ``get_values_list`` to read field values.
        fields: field names to inspect.
        field_types: type names, positionally aligned with ``fields``.
        min_completeness: exclusive lower bound on the truthy-value ratio.
        min_uniqueness: exclusive lower bound on the distinct-value ratio.

    Returns:
        A list of dicts with keys "name", "uniqueness" and "language",
        sorted by descending uniqueness.
    """
    print("starting get_name_field with:")
    print(fields)
    print(field_types)
    name_fields = []
    for i, field in enumerate(fields):
        field_lower = field.lower()
        field_type = field_types[i]
        values = get_values_list(layer, field)

        # The original code used ``number_of_values`` without ever defining
        # it; the ratios below are relative to the number of values read.
        number_of_values = len(values)
        if number_of_values == 0:
            # No values: both ratios are undefined, so the field cannot be
            # a name field (and dividing would raise ZeroDivisionError).
            continue

        values_set = set(values)
        number_of_unique_values = len(values_set)
        completeness = sum(1 for value in values if value) / float(number_of_values)
        uniqueness = number_of_unique_values / float(number_of_values)

        # Cheap checks first; the helper calls only run when they pass.
        if ("pc" not in field_lower
                and field_type == "OFTString"
                and completeness > min_completeness
                and uniqueness > min_uniqueness
                and not isGibberish(values_set)
                and number_of_unique_values > 10
                and isNameList(values_set)):
            language = detect_language(values)
            name_fields.append({
                "name": field,
                "uniqueness": uniqueness,
                "language": language
            })

    # sort namefields by uniqueness, most unique first
    name_fields = sorted(name_fields,
                         key=lambda namefield: -namefield["uniqueness"])

    print("name_fields are", name_fields)
    return name_fields
    def post(self, request):
        """Handle a submitted phrase and render the detection result.

        The language stays ``None`` when the bound form does not validate.
        """
        form = DetectLanguageForm(request.POST)
        language = detect_language(form.data['phrase']) if form.is_valid() else None
        context = {'form': form, 'language': language}
        return render(request, self.template_name, context)
Esempio n. 4
0
    def parse_conversation(self, response):
        """Extract the utterances of one conversation page and follow paging.

        Speakers and post texts are read from ``response`` with CSS
        selectors. When the page holds more than one utterance, each
        cleaned, non-whitespace utterance that passes the language check is
        wrapped in an ``<utt>`` tag; the resulting ``<s>`` block is written
        to ``self.text_file`` only if more than one utterance was kept.

        Yields:
            A ``scrapy.Request`` for every pagination link on the page,
            handled by this same callback.
        """
        speakers = response.css(
            'body > div.container > div.msgBlock > table > tr > td > b'
        ).extract()
        # Build a real list: on Python 3 ``filter`` returns a lazy iterator,
        # which supports neither len() nor indexing as used below.
        utterances = [
            raw for raw in response.css('td[id=post]::text').extract()
            if raw != '\r' and '[Message edited by' not in raw
        ]
        clean_utterances_count = 0
        # Check if conversation contains more than 1 utterance
        if len(utterances) > 1:
            pieces = ["<s>"]
            for index, speaker in enumerate(speakers):
                parsed_clean_utterance = self.clean_utterance(
                    utterances[index])
                # Skip whitespace-only utterances.
                # NOTE(review): an empty string is not "whitespace" for
                # str.isspace(), so empty utterances still pass - confirm.
                if parsed_clean_utterance.isspace():
                    continue
                # NOTE(review): utterances NOT detected as English are the
                # ones kept here - confirm this is the intended filter.
                if detect_language(parsed_clean_utterance) != 'English':
                    clean_utterances_count += 1
                    pieces.append('<utt uid="' +
                                  str(self.generate_uid(speaker)) + '">' +
                                  parsed_clean_utterance + '</utt>')
            if clean_utterances_count > 1:
                pieces.append("</s>\n")
                self.text_file.write("".join(pieces))

        # Visit all pages of the current post
        next_page_urls = response.css(
            'body > div.container > div.msgBlock > table > tr.msgHeader > td > div.fnavhead > div.fnavnum > a::attr(href)'
        ).extract()
        for next_url in next_page_urls:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_conversation)
def f_lan(s):
    """
    :param s: string to be processed
    :return: boolean (s is detected as English or French)
    """
    # French is accepted too: some reviews are actually English but the
    # detector is biased toward French for them.
    accepted = {'English', 'French'}
    detected = detect_language(s)
    return detected in accepted
Esempio n. 6
0
def generate_output(language):
  """Re-check the language of a cleaned tweet CSV and write the kept rows.

  Reads ``input/vw_clean_<code>.csv`` where ``<code>`` is the lower-cased
  first element of ``language``. The code 'en' selects 'english'; any other
  code selects 'spanish'. A row is kept when its text has more than one
  word, is detected as the selected language and does not start with 'rt'.
  The kept rows are written to ``output/vw_clean_<code>_rechecked.csv``.

  On KeyboardInterrupt the rows collected so far are still written, then
  the process exits via ``sys.exit()``.
  """
  input_lang = language[0].lower()
  output_path = 'output/vw_clean_' + input_lang + '_rechecked.csv'
  df = pd.read_csv('input/vw_clean_' + input_lang + '.csv')

  lang = 'english' if input_lang == 'en' else 'spanish'
  print('Processing dataset in', lang)

  # Collect plain dicts and build the frame once at the end:
  # DataFrame.append was removed in pandas 2.0 and copied the whole frame
  # on every call.
  rows = []
  interrupted = False
  try:
    for _, row in df.iterrows():
      raw_text = str(row['text'])
      if len(raw_text.split()) <= 1:
        continue
      if lang_detector.detect_language(raw_text) != lang:
        continue
      if raw_text.lower().startswith('rt'):  # we avoid RT
        continue
      text = preprocessor.process_tweet(raw_text).replace('volkswagen', ' ').replace('vw', ' ')
      rows.append({
        'id': row['id'], 'created_at': row['created_at'],
        'name': row['name'], 'screen_name': row['screen_name'],
        # NOTE(review): 'verified' is filled from 'followers_count' -
        # looks like a copy/paste slip; confirm against the input schema.
        'verified': row['followers_count'], 'friends_count': row['friends_count'],
        'text': text, 'description': row['description'],
        'lang': row['lang'], 'time_zone': row['time_zone'],
        'location': row['location'],
      })
  except KeyboardInterrupt:
    # Fall through so the partial result is still written below.
    interrupted = True

  print('Generating file in', lang)
  pd.DataFrame(rows).to_csv(output_path)
  if interrupted:
    sys.exit()
Esempio n. 7
0
def f_lan(s):
    """
    :param s: string to be processed
    :return: boolean (s is detected as English)
    """
    # Only English passes; some reviews are French and must be rejected.
    accepted = {'English'}
    detected = detect_language(s)
    return detected in accepted
Esempio n. 8
0
def f_lan(s):
    """
    :param s: string to be processed
    :return: boolean (s is detected as English)
    """
    # Some reviews are French but have been incorrectly labeled as EN, so
    # French is deliberately not part of the accepted set.
    accepted = {'English'}
    detected = detect_language(s)
    return detected in accepted
Esempio n. 9
0
 def check_lang(self, text, lang_list={'English', 'French'}):
     """
     Function for language detection.
         @param text (string): text to be processed.
         @param lang_list(set): allowed languages for the text (Default: English + French).
             The default set is shared between calls; it is only read here.
         @return boolean: (True if text belongs to any one of the languages in lang_list)
     """
     # The original passed an undefined name ``s``; the text to inspect is
     # the ``text`` parameter.
     return detect_language(text) in lang_list
    def post(self, request):
        """Handle a submitted phrase and render the detection result.

        The language stays ``None`` when the bound form does not validate.
        """
        form = DetectLanguageForm(request.POST)
        if form.is_valid():
            language = detect_language(form.data['phrase'])
        else:
            language = None

        context = {'form': form, 'language': language}
        return render(request, self.template_name, context)
Esempio n. 11
0
def spell_checker(text, depth):
    """Return "<word> - <correction>" entries for words that get corrected.

    Each word from ``get_words_from_text(text)`` has its language detected;
    words whose language is 'unknown' are skipped. For the rest,
    ``get_correct_word`` is asked for a correction (``depth`` is passed
    through) and the word is reported when the correction differs from it.
    """
    mistakes = []
    for token in get_words_from_text(text):
        token_lang = detect_language(token)
        if token_lang == 'unknown':
            continue
        suggestion = get_correct_word(token, token_lang, depth)
        if suggestion != token:
            mistakes.append(token + " - " + suggestion)
    return mistakes
Esempio n. 12
0
def clean(df):
    """Clean a tweet DataFrame and keep only rows detected as English.

    Expects the columns 'tweet' and 'date'. The steps, in order:
    strip ``$...`` and ``@...`` tokens, drop rows whose 'date' equals the
    literal 'date', convert string dates with ``to_datetime``, drop the
    first column, remove URLs and punctuation, lower-case, collapse
    whitespace, blank out tweets not detected as English, drop rows with
    nulls and reset the index.

    Note: the first steps modify the DataFrame passed in; use the returned
    frame as the result.

    Returns:
        The cleaned DataFrame with a fresh 0..n-1 index.
    """
    import re
    from language_detector import detect_language

    # remove $ and @mentions
    df['tweet'] = df['tweet'].map(lambda x: re.sub(r'\$[A-Za-z0-9]*', '', x))
    df['tweet'] = df['tweet'].map(lambda x: re.sub(r'\@[A-Za-z0-9]*', '', x))

    # drop rows where 'date' holds the literal string 'date', then convert
    # string dates to datetime
    df.drop(df[df['date'] == 'date'].index, inplace=True)
    df['date'] = df['date'].apply(lambda x: to_datetime(x) if isinstance(x, str) else x)

    # drop first column
    df = df.drop(df.columns[0], axis=1)

    # remove http(s)/www links; regex=True is required because pandas 2.x
    # treats the pattern literally by default
    df['tweet'] = df['tweet'].str.replace(r'https?://[^\s<>"]+|www\.[^\s<>"]+', "", regex=True)

    # remove punctuation (raw string so \w and \s reach the regex engine)
    df['tweet'] = df['tweet'].str.replace(r'[^\w\s]', '', regex=True)

    # to lowercase
    df['tweet'] = df['tweet'].map(lambda s: s.lower() if isinstance(s, str) else s)

    # collapse whitespace runs / literal "\n" into one space; assign the
    # result instead of using inplace=True on a column selection, which may
    # act on a temporary copy
    df['tweet'] = df['tweet'].replace(r'\s+|\\n', ' ', regex=True)

    df = df.reset_index(drop=True)

    # blank out tweets that are not detected as English, then turn the
    # blanks into NaN so dropna() removes those rows
    df['tweet'] = df['tweet'].map(lambda x: x if detect_language(x) == 'English' else '')
    df['tweet'] = df['tweet'].replace('', np.nan)

    # drop nulls
    df = df.dropna()

    df = df.reset_index(drop=True)

    return df
Esempio n. 13
0
 def test_detect_language_english_with_module_language_specification(self):
     # English prose checked against the module-level LANGUAGES definition.
     sample = """
             Shakespeare was born and brought up in Stratford-upon-Avon,
             Warwickshire. At the age of 18, he married Anne Hathaway, 
             with whom he had three children: Susanna, and twins Hamnet and Judith.
             Sometime between 1585 and 1592, he began a successful career in London 
             as an actor, writer, and part-owner of a playing company called the Lord
             Chamberlain's Men, later known as the King's Men.
             """
     detected = detect_language(sample, LANGUAGES)
     self.assertEqual(detected, 'English')
 def test_detect_language_english(self):
     # English sample checked against the fixture's own language list.
     sample = """
         Messi plays since he was 14 years for the FC Barcelona .
         At 24, he was top scorer of FC Barcelona , 25
         the youngest player in La Liga history , the 200 goals
         scored . Meanwhile, Messi is the only player over 300
         achieved top-flight and is therefore scorer
         La Liga .
     """
     detected = detect_language(sample, self.languages)
     self.assertEqual(detected, 'English')
Esempio n. 15
0
 def test_detect_language_english(self):
     # English sample checked against the fixture's own language list.
     english_sample = """
         Messi plays since he was 14 years for the FC Barcelona .
         At 24, he was top scorer of FC Barcelona , 25
         the youngest player in La Liga history , the 200 goals
         scored . Meanwhile, Messi is the only player over 300
         achieved top-flight and is therefore scorer
         La Liga .
     """
     self.assertEqual(detect_language(english_sample, self.languages),
                      'English')
Esempio n. 16
0
 def test_detect_language_german(self):
     # German sample checked against the fixture's own language list.
     sample = """
         Messi spielt seit seinem 14. Lebensjahr für den FC Barcelona.
         Mit 24 Jahren wurde er Rekordtorschütze des FC Barcelona, mit 25
         der jüngste Spieler in der La-Liga-Geschichte, der 200 Tore
         erzielte. Inzwischen hat Messi als einziger Spieler mehr als 300
         Erstligatore erzielt und ist damit Rekordtorschütze
         der Primera División.
     """
     detected = detect_language(sample, self.languages)
     self.assertEqual(detected, 'German')
 def test_detect_language_german(self):
     # German sample checked against the fixture's own language list.
     german_sample = """
         Messi spielt seit seinem 14. Lebensjahr für den FC Barcelona.
         Mit 24 Jahren wurde er Rekordtorschütze des FC Barcelona, mit 25
         der jüngste Spieler in der La-Liga-Geschichte, der 200 Tore
         erzielte. Inzwischen hat Messi als einziger Spieler mehr als 300
         Erstligatore erzielt und ist damit Rekordtorschütze
         der Primera División.
     """
     self.assertEqual(detect_language(german_sample, self.languages),
                      'German')
 def test_detect_language_english(self):
     # English sample checked against the fixture's own language list.
     sample = """
          Four score and seven years ago our fathers brought forth on this 
          continent, a new nation, conceived in Liberty, and dedicated to the 
          proposition that all men are created equal. Now we are engaged in a great 
          civil war, testing whether that nation, or any nation so conceived and 
          so dedicated, can long endure. We are met on a great battle-field of that war. 
          We have come to dedicate a portion of that field, as a final resting 
          place for those who here gave their lives that that nation might live.
     """
     detected = detect_language(sample, self.languages)
     self.assertEqual(detected, 'English')
Esempio n. 19
0
 def test_detect_language_spanish(self):
     # Spanish sample checked against the fixture's own language list.
     sample = """
         Lionel Andrés Messi Cuccittini (Rosario, 24 de junio de 1987),
         conocido como Leo Messi, es un futbolista argentino11 que juega
         como delantero en el Fútbol Club Barcelona y en la selección
         argentina, de la que es capitán. Considerado con frecuencia el
         mejor jugador del mundo y calificado en el ámbito deportivo como el
         más grande de todos los tiempos, Messi es el único futbolista en la
         historia que ha ganado cinco veces el FIFA Balón de Oro –cuatro de
         ellos en forma consecutiva– y el primero en
         recibir tres Botas de Oro.
     """
     detected = detect_language(sample, self.languages)
     self.assertEqual(detected, 'Spanish')
 def test_detect_language_spanish(self):
     # Spanish sample checked against the fixture's own language list.
     spanish_sample = """
         Lionel Andrés Messi Cuccittini (Rosario, 24 de junio de 1987),
         conocido como Leo Messi, es un futbolista argentino11 que juega
         como delantero en el Fútbol Club Barcelona y en la selección
         argentina, de la que es capitán. Considerado con frecuencia el
         mejor jugador del mundo y calificado en el ámbito deportivo como el
         más grande de todos los tiempos, Messi es el único futbolista en la
         historia que ha ganado cinco veces el FIFA Balón de Oro –cuatro de
         ellos en forma consecutiva– y el primero en
         recibir tres Botas de Oro.
     """
     self.assertEqual(detect_language(spanish_sample, self.languages),
                      'Spanish')
 def test_detect_language_english(self):
     # NOTE: this test only passes once an "English" language has been
     #       defined in the languages.py module.
     sample = """
         # english
         Lionel Andrés 'Leo' Messi is an Argentine professional footballer
         who plays as a forward for Spanish club FC Barcelona and the
         Argentina national team. Often considered the best player in the
         world and rated by many in the sport as the greatest of all time,
         Messi is the only football player in history to win five FIFA
         Ballons, four of which he won consecutively, and the first player
         to win three European Golden Shoes.
     """
     detected = detect_language(sample, LANGUAGES)
     self.assertEqual(detected, 'English')
 def test_detect_language_english(self):
     # NOTE: this test only passes once an "English" language has been
     #       defined in the languages.py module.
     english_sample = """
         # english
         Lionel Andrés 'Leo' Messi is an Argentine professional footballer
         who plays as a forward for Spanish club FC Barcelona and the
         Argentina national team. Often considered the best player in the
         world and rated by many in the sport as the greatest of all time,
         Messi is the only football player in history to win five FIFA
         Ballons, four of which he won consecutively, and the first player
         to win three European Golden Shoes.
     """
     self.assertEqual(detect_language(english_sample, LANGUAGES),
                      'English')
Esempio n. 23
0
    def test_detect_language_mixed_languages(self):
        # A Spanish passage followed by a German one; the detector is
        # expected to report 'Spanish' for the combined text.
        mixed_sample = """
            # spanish
            Lionel Andrés Messi Cuccittini (Rosario, 24 de junio de 1987),
            conocido como Leo Messi, es un futbolista argentino11 que juega
            como delantero en el Fútbol Club Barcelona y en la selección
            argentina, de la que es capitán.

            # german
            Messi spielt seit seinem 14. Lebensjahr für den FC Barcelona.
            Mit 24 Jahren wurde er Rekordtorschütze des FC Barcelona, mit 25
            der jüngste Spieler in der La-Liga-Geschichte, der 200 Tore
            erzielte.
        """
        detected = detect_language(mixed_sample, self.languages)
        self.assertEqual(detected, 'Spanish')
    def test_detect_language_mixed_languages(self):
        # A Spanish passage followed by a German one; the detector is
        # expected to report 'Spanish' for the combined text.
        sample = """
            # spanish
            Lionel Andrés Messi Cuccittini (Rosario, 24 de junio de 1987),
            conocido como Leo Messi, es un futbolista argentino11 que juega
            como delantero en el Fútbol Club Barcelona y en la selección
            argentina, de la que es capitán.

            # german
            Messi spielt seit seinem 14. Lebensjahr für den FC Barcelona.
            Mit 24 Jahren wurde er Rekordtorschütze des FC Barcelona, mit 25
            der jüngste Spieler in der La-Liga-Geschichte, der 200 Tore
            erzielte.
        """
        self.assertEqual(detect_language(sample, self.languages),
                         'Spanish')
Esempio n. 25
0
def update_languages(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a "lang" column derived from each row's title.

    The value is the first two characters of whatever ``detect_language``
    returns for the title; any exception raised while computing it yields
    ``None`` for that row. The DataFrame is modified in place.

    Args:
        df (pd.DataFrame): The dataframe, with a column "title".

    Returns:
        pd.DataFrame: The same DF, with the new column "lang".
    """

    def _lang_prefix(row):
        # Best effort: any failure (including reading the title) maps to None.
        try:
            return detect_language(row.title)[0:2]
        except Exception:
            return None

    df['lang'] = [_lang_prefix(row) for _, row in df.iterrows()]
    return df
Esempio n. 26
0
def get_name_fields(layer, fields, field_types, min_completeness=0.8, min_uniqueness=0.8):
    print "starting get_name_field with:"
    print fields
    print field_types
    name_fields = []
    for i, field in enumerate(fields):
        field = fields[i]
        field_lower = field.lower()
        field_type = field_types[i]
        values = [value.lower() if isinstance(value, str) or isinstance(value, unicode) else value for value in layer.get_fields(field)]
        values_set = set(values)
        number_of_unique_values = len(values_set)
        completeness = float(len([value for value in values if value])) / float(number_of_values)
        uniqueness = float(number_of_unique_values) / float(number_of_values)
        if "pc" not in field_lower and field_type == "OFTString" and completeness > min_completeness and uniqueness > min_uniqueness and not isGibberish(values_set) and number_of_unique_values > 10 and isNameList(values_set):
            language = detect_language(values)
            name_fields.append({"name": field, "uniqueness": uniqueness, "language": language})

    # sort namefields by uniqueness
    name_fields = sorted(name_fields, key = lambda namefield: -1*namefield["uniqueness"])

    print "name_fields are", name_fields
    return name_fields
Esempio n. 27
0
def get_repo_info(full_name):
    """Build an item record for the repository named ``full_name``.

    Labels are the repository topics plus the lower-cased name of the first
    language reported by ``get_languages()`` (if not already present).
    Categories hold "language:zh" or "language:en" when the README text is
    detected as Mandarin or English respectively, otherwise nothing.

    Returns:
        dict with keys "ItemId", "Timestamp", "Labels", "Categories" and
        "Comment".
    """
    repo = github_client.get_repo(full_name)

    # Labels: topics first, then the first reported language.
    labels = list(repo.get_topics())
    language_stats = list(repo.get_languages().items())
    if language_stats:
        first_language = language_stats[0][0].lower()
        if first_language not in labels:
            labels.append(first_language)

    # Categories: natural language of the README.
    categories = []
    readme_text = repo.get_readme().decoded_content.decode("utf-8")
    spoken_language = detect_language(readme_text)
    for language_name, category in (("Mandarin", "language:zh"),
                                    ("English", "language:en")):
        if spoken_language == language_name:
            categories.append(category)
            break

    item_id = full_name.replace("/", ":").lower()
    return {
        "ItemId": item_id,
        "Timestamp": str(repo.updated_at),
        "Labels": labels,
        "Categories": categories,
        "Comment": repo.description,
    }
 def test_detect_language_english_with_module_language_specification(self):
     # English fixture text against the module-level LANGUAGES;
     # the comparison ignores letter case.
     detected = detect_language(self.texts["english"], LANGUAGES).lower()
     self.assertEqual(detected, 'english')
Esempio n. 29
0
#coding=utf-8
from language_detector import detect_language
# Detect and print the language of a single CJK character.
text = u"中"
language = detect_language(text)
# Call form of print: identical output on Python 2 for a single argument,
# and valid syntax on Python 3 (the bare ``print language`` statement is not).
print(language)
 def test_detect_language_hindi_with_module_language_specification(self):
     # Hindi fixture text against the module-level LANGUAGES;
     # the comparison ignores letter case.
     detected = detect_language(self.texts["hindi"], LANGUAGES).lower()
     self.assertEqual(detected, 'hindi')
 def test_detect_language_mixed_with_our_language_specification(self):
     # "mostly-spanish" fixture text against the fixture's own language
     # list; the comparison ignores letter case.
     detected = detect_language(self.texts["mostly-spanish"], self.languages).lower()
     self.assertEqual(detected, 'spanish')
 def test_detect_language_mixed_with_module_language_specification(self):
     # "mostly-spanish" fixture text against the module-level LANGUAGES;
     # the comparison ignores letter case.
     detected = detect_language(self.texts["mostly-spanish"], LANGUAGES).lower()
     self.assertEqual(detected, 'spanish')
Esempio n. 33
0
def main(file, userselection, n_pois, output, desc):
    """Run the review pipeline: clean text, fit LDA topics, rank results.

    Args:
        file: CSV source passed to ``pd.read_csv``; rows are de-duplicated
            on the 'Tripadvisor' column and must have a 'description' column.
        userselection: passed to ``user_selection`` and, unchanged, as
            ``relevant=`` to ``best_results``.
        n_pois: passed through to ``best_results``.
        output: file-name prefix for the results CSV; nothing is written
            when it equals "default".
        desc: '|'-separated CSV passed to ``pd.read_csv``; it is joined to
            the results on its 'name' column.

    Returns:
        The ``best_results`` output inner-merged with the ``desc`` table
        (results 'Tripadvisor' == desc 'name').
    """

    df_reviews = pd.read_csv(file).drop_duplicates(subset=['Tripadvisor'],
                                                   keep='first')
    # Lower-case the description text
    df_reviews['description'] = df_reviews['description'].str.lower()
    # Strip every run of digits from the text
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: re.sub(r'\d+', '', x))
    # Decode HTML entities (html.unescape does not remove tags)
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: (html.unescape(x)))
    # Remove ASCII punctuation
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: x.translate(str.maketrans('', '', string.punctuation)))
    # Transliterate accented characters via unidecode
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: unidecode.unidecode(x))
    # Remove specific words; the "'s" substitution runs after punctuation
    # (including apostrophes) has already been removed above
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: re.sub("description", '', x))
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: re.sub("wikipedia", '', x))
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: re.sub("'s", '', x))
    # Tokenize, drop English stop words and tokens not detected as English
    # (detection runs per single word), then lemmatize what is left
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    df_reviews['tokens'] = df_reviews['description'].apply(lambda x: [
        lemmatizer.lemmatize(word) for word in word_tokenize(x)
        if not word in stop_words and detect_language(word) == 'English'
    ])

    # Build the corpus and the bag-of-words matrix (at most 2000 features)
    corpus = get_corpus(df_reviews)

    seg_list = split_to_words(corpus)
    vectorizer_model = CountVectorizer(stop_words=stop_words,
                                       analyzer='word',
                                       max_features=2000)
    vec_docs = vectorizer_model.fit_transform(seg_list)
    # NOTE(review): get_feature_names() is not available in recent
    # scikit-learn releases (get_feature_names_out replaces it) - confirm
    # the pinned version.
    tf_feature_names = vectorizer_model.get_feature_names()

    no_topics = 10
    no_top_words = 5

    # The model is fitted here and fitted again by fit_transform below;
    # the return value of this first display_topics call is discarded.
    lda = LatentDirichletAllocation(n_components=no_topics,
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=40.,
                                    random_state=0).fit(vec_docs)
    display_topics(lda, tf_feature_names, no_top_words)
    lda_results = lda.fit_transform(vec_docs)

    df_reviews = get_topics(lda_results, df_reviews)

    topic_dict = display_topics(lda, tf_feature_names, no_top_words)

    # Long-format table with columns 'variable' and 'value', built from
    # topic_dict, so it can be joined on the 'topics' column
    h = pd.DataFrame.from_dict(topic_dict, orient='index').transpose().melt()

    df_reviews = df_reviews.merge(h,
                                  left_on='topics',
                                  right_on='variable',
                                  how='left')
    df_reviews = df_reviews.drop(columns=['topics', 'variable', 'tokens'])
    # One-hot encode the 'value' column as 'keyword_*' indicator columns
    df_reviews = pd.get_dummies(df_reviews,
                                prefix=['keyword'],
                                columns=['value']).drop_duplicates()
    cols = [col for col in df_reviews.columns if 'keyword' not in col]

    # Collapse rows that share all non-keyword columns, summing indicators
    df_reviews = df_reviews.groupby(cols).sum().reset_index()

    df_reviews = review_rate(df_reviews)

    selection = user_selection(userselection)

    results = best_results(dict_user(df_reviews, selection),
                           df_reviews,
                           n_pois,
                           relevant=userselection)

    # Optional CSV export, named "<output>_<selection>.csv"
    if output != "default":
        results.to_csv("{output}_{sufix}.csv".format(output=output,
                                                     sufix=selection),
                       index=False)
    # Attach the descriptions; the inner join drops results without a match
    desc_ = pd.read_csv(desc, sep="|")
    results = results.merge(desc_,
                            left_on='Tripadvisor',
                            right_on='name',
                            how='inner')
    return results
 def test_detect_language_ambiguous(self):
     # For this sentence the detector is expected to return a set holding
     # both 'Spanish' and 'English' rather than a single language name.
     sample = """
         A giant dog chased seven cats.
     """
     detected = detect_language(sample, self.languages)
     self.assertEqual(detected, {'Spanish', 'English'})
Esempio n. 35
0
def is_sent_english(sent):
    """Return True when ``ld.detect_language(sent)`` equals 'english'."""
    detected = ld.detect_language(sent)
    return detected == 'english'
 def test_detect_language_english_with_module_language_specification(self):
     # English fixture text against the module-level LANGUAGES;
     # the comparison ignores letter case.
     detected = detect_language(self.texts["english"], LANGUAGES).lower()
     self.assertEqual(detected, 'english')
 def test_detect_language_german_with_module_language_specification(self):
     # German fixture text against the module-level LANGUAGES;
     # the comparison ignores letter case.
     detected = detect_language(self.texts["german"], LANGUAGES).lower()
     self.assertEqual(detected, 'german')
 def test_detect_language_german_with_our_language_specification(self):
     # German fixture text against the fixture's own language list;
     # the comparison ignores letter case.
     detected = detect_language(self.texts["german"], self.languages).lower()
     self.assertEqual(detected, 'german')
 def test_detect_language_spanish_with_our_language_specification(self):
     # Spanish fixture text against the fixture's own language list;
     # the comparison ignores letter case.
     detected = detect_language(self.texts["spanish"], self.languages).lower()
     self.assertEqual(detected, 'spanish')
 def test_detect_language_english_with_our_language_specification(self):
     # English fixture text against the fixture's own language list;
     # the comparison ignores letter case.
     detected = detect_language(self.texts["english"], self.languages).lower()
     self.assertEqual(detected, 'english')
 def test_detect_language_german_with_our_language_specification(self):
     # German fixture text against the fixture's own language list;
     # the comparison ignores letter case.
     german_text = self.texts["german"]
     self.assertEqual(detect_language(german_text, self.languages).lower(),
                      'german')
 def test_detect_language_german_with_module_language_specification(self):
     # German fixture text against the module-level LANGUAGES;
     # the comparison ignores letter case.
     german_text = self.texts["german"]
     self.assertEqual(detect_language(german_text, LANGUAGES).lower(),
                      'german')
def is_line_in_english(line):
    """Tell whether a line is detected as English.

    - Input: line - string.
    - Return: True if ``ld.detect_language(line)`` equals 'english',
      False otherwise.
    """
    detected = ld.detect_language(line)
    return detected == 'english'