from nltk.classify import textcat


def detect_language(sentences):
    """
    Detect the language of each sentence using TextCat, NLTK's
    implementation of the text-categorization algorithm.
    """
    # Instantiate the TextCat classifier
    text_cat = textcat.TextCat()

    # Print the detected language of each sentence
    for sentence in sentences:
        print("the sentence:\n\t '{0}' => is written in {1}".format(
            sentence, text_cat.guess_language(sentence)))
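

# A quick usage sketch; the sample sentences below are made up. Note that
# TextCat needs NLTK's crubadan corpus (nltk.download('crubadan')) before
# guess_language will work.
detect_language([
    "The quick brown fox jumps over the lazy dog.",
    "El zorro marrón rápido salta sobre el perro perezoso.",
])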
Example #2
from nltk.classify import textcat as tc


def get_textcat_languages(tbl, col):
    """Add a 'textcat_langs' column with the guessed language of each value in col."""
    t = tc.TextCat()
    tbl.is_copy = False  # silence the pandas SettingWithCopyWarning on older versions
    langs = []
    for s in tbl[col]:
        try:
            langs.append(t.guess_language(s))
        except Exception:
            # guess_language can fail on empty or non-string values
            langs.append('unk')
    tbl['textcat_langs'] = langs
    return tbl
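

# Minimal usage sketch, assuming tbl is a pandas DataFrame; the sample
# rows below are made up.
import pandas as pd

df = pd.DataFrame({'text': ['Hello world', 'Bonjour tout le monde']})
df = get_textcat_languages(df, 'text')
print(df['textcat_langs'])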
Example #3
    "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
    "these", "those", "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an",
    "the", "and", "but", "if", "or", "because", "as", "until", "while", "of",
    "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "to", "from",
    "up", "down", "in", "out", "on", "off", "over", "under", "again",
    "further", "then", "once", "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some",
    "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
    "very", "s", "t", "can", "will", "just", "don", "should", "now"
]

stemmer = snowball.EnglishStemmer()

text_classifier = tc.TextCat()


# Create a graph whose nodes are sentences, with edges between similar sentences
def create_graph(sentences, words):
    graph = net.Graph()
    for s in sentences:
        graph.add_node(s.raw_text)
    graph = add_edges(graph, sentences, words)
    return graph


# Create an edge when the similarity of two sentences is above a certain threshold
def add_edges(graph, sentences, words):
    for s1 in sentences:
        for s2 in sentences:
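

# A minimal sketch of how add_edges might finish; the original snippet is
# cut off inside the inner loop. The Jaccard-style similarity helper and
# the 0.1 threshold are assumptions, not the original author's code.
SIMILARITY_THRESHOLD = 0.1


def similarity(s1, s2, words):
    # hypothetical helper: Jaccard overlap of stemmed non-stopword tokens
    t1 = {stemmer.stem(w) for w in s1.raw_text.split() if w.lower() not in words}
    t2 = {stemmer.stem(w) for w in s2.raw_text.split() if w.lower() not in words}
    if not t1 or not t2:
        return 0.0
    return len(t1 & t2) / len(t1 | t2)


def add_edges_sketch(graph, sentences, words):
    for s1 in sentences:
        for s2 in sentences:
            if s1 is not s2 and similarity(s1, s2, words) > SIMILARITY_THRESHOLD:
                graph.add_edge(s1.raw_text, s2.raw_text)
    return graph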
Example #4
# Language detection tools
from collections import defaultdict

import fasttext
from langdetect import detect_langs
from polyglot.detect import Detector
from langid.langid import LanguageIdentifier, model
from nltk.classify import textcat
from utility import set_iso_639

# Load the pretrained fasttext language-identification model
ft_model = fasttext.load_model('lib/lid.176.bin')

# Instantiate a langid language identifier object
langid_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

# Instantiate a textcat language classifier
tc_cls = textcat.TextCat()


def detect_language(text, guarani=False):
    '''
    Return the ISO 639-1 code of the detected language.
    '''
    threshold_confidence = 0.70  # changed because gn,grn,gug is tricky, old 0.75
    lang_detected = defaultdict(int)

    if not text:
        raise ValueError('text is empty')

    # Infer the language using fasttext
    try:
        pred_fasttext = ft_model.predict(text, k=1)
    except Exception:
        # assumed fallback: the original example is cut off after the predict call
        pred_fasttext = None
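

# The example above is truncated after the fasttext call. As a rough
# illustration of how the per-detector guesses might feed lang_detected,
# here is a minimal majority-voting sketch; combine_votes and its voting
# strategy are assumptions, not the original author's code.
def combine_votes(text, threshold=0.70):
    votes = defaultdict(int)

    # fasttext: predict returns (labels, probs), labels like '__label__en'
    labels, probs = ft_model.predict(text, k=1)
    if probs[0] >= threshold:
        votes[labels[0].replace('__label__', '')] += 1

    # langid: classify returns (language code, normalized probability)
    lang, prob = langid_identifier.classify(text)
    if prob >= threshold:
        votes[lang] += 1

    # textcat reports no confidence, so its guess always counts as one vote
    votes[tc_cls.guess_language(text)] += 1

    # note: the detectors emit different code sets; the original imports
    # set_iso_639, presumably to normalize them to ISO 639-1
    return max(votes, key=votes.get) if votes else 'unk'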
Example #5

from grpc_client import *
from data_formats import *
from nltk.classify import textcat
import json
import re

if __name__ == "__main__":
    f_out = "data_with_languages.json"
    language_classifier = textcat.TextCat()
    journalCollections = dict()

    host = "172.22.247.23:8888"
    with grpc.insecure_channel(host) as channel:
        stub = protob_pb2_grpc.BHLIndexStub(channel)
        for title in Titles(stub):
            text = ""
            text_index = 10  # sample text from at most the first 10 pages
            journal = Journal(title)
            for page in Pages(stub, withText=True, titles=[title.id]):
                if text_index > 0:
                    txt = page.text
                    txt = txt.replace(b'\r', b'')
                    txt = txt.replace(b'\n', b'')
                    txt = txt.decode("ascii", "ignore")
                    text += re.sub(
                        r'[^\w]', ' ',
                        txt)  # replace all non-word characters with spaces
                    text += " "
                journal.add_page(page)
                text_index -= 1
            # Classify the journal's language from the sampled page text
            journal.lang = language_classifier.guess_language(text.lower())
Example #6
    sql = """ SELECT id, {}_data FROM sentiment.{} ORDER BY id""".format(
        lang, table)
    cursor.execute(sql)
    rs = cursor.fetchall()
    for row in rs:
        # print('%s (%s) %s' % (row['sentence'], crubadan.iso_to_crubadan(tct.guess_language(row['sentence'])), lid.classify(row['sentence'])))
        sql = """ UPDATE sentiment.{} SET {}_textcat=%s, {}_langid=%s WHERE id=%s """.format(
            table, lang, lang)
        cursor.execute(
            sql, (crubadan.iso_to_crubadan(
                tct.guess_language(row['{}_data'.format(lang)])),
                  lid.classify(row['{}_data'.format(lang)])[0], row['id']))
        conn.commit()
        print('.', end='', flush=True)


tct = textcat.TextCat()
lid = LanguageIdentifier.from_modelstring(model, norm_probs=True)
# lid.set_languages(['en', 'ms'])

dbcon = MySQLdb.connect(host='localhost',
                        user='******',
                        passwd='123456',
                        db='sentiment',
                        cursorclass=MySQLdb.cursors.DictCursor)
# cursor = dbcon.cursor()
# process_lid(dbcon, 'imdb_train', 'en', tct, lid)
# process_lid(dbcon, 'imdb_train', 'ms', tct, lid)
# process_lid(dbcon, 'imdb_test', 'en', tct, lid)
# process_lid(dbcon, 'imdb_test', 'ms', tct, lid)
Example #7
    'ko': 'Korean',
    'ar': 'Arabic',
    'zh': 'Chinese (Simplified)',
    'cnr': 'Montenegrin',
    'zh-TW': 'Chinese (Traditional)',
    'ne': 'Nepali',
    'gu': 'Gujarati',
    'ta': 'Tamil',
    'he': 'Hebrew',
    'te': 'Telugu',
    'en': 'English'
}

Text = input("Enter the Text: ")

classifier = textcat.TextCat()

distances = classifier.lang_dists(Text)
ans = classifier.guess_language(Text)

# Goslate Language Detector

# gs = goslate.Goslate()
# lan_id = gs.detect(Text)
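
# lang_identifier is not defined in the visible part of this example; a
# minimal hypothetical stand-in, assuming it simply wraps textcat's
# ISO 639-3 guess:
def lang_identifier(text):
    return classifier.guess_language(text)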

language = lang_identifier(Text)

txt = '(ISO639-3) Code: ' + language
res = " ".join(line.get(ele, ele) for ele in txt.split())
# print("Detected Language: ",gs.get_languages()[lan_id], res)