from nltk.classify import textcat


def detect_language(sentences):
    """
    Detect the language of each sentence using NLTK's TextCat, an
    implementation of the TextCat text-categorization algorithm.
    """

    # instantiate the TextCat classifier
    text_cat = textcat.TextCat()

    # print the detected language of each sentence
    for sentence in sentences:
        print("the sentence:\n\t '{0}' => is written in {1}".format(
            sentence, text_cat.guess_language(sentence)))
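# A minimal usage sketch for the function above (not part of the original
# snippet). TextCat relies on NLTK's crubadan corpus, which is assumed to have
# been downloaded already, e.g. via nltk.download('crubadan').
sample_sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Der schnelle braune Fuchs springt über den faulen Hund.",
]
detect_language(sample_sentences)  # prints a guessed language code per sentence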
Code Example #2
from nltk.classify import textcat as tc


def get_textcat_languages(tbl, col):
    """Guess the language of each value in tbl[col] and store it in a new column."""
    t = tc.TextCat()
    tbl = tbl.copy()  # work on a copy to avoid pandas SettingWithCopyWarning
    langs = []
    for s in tbl[col]:
        try:
            langs.append(t.guess_language(s))
        except Exception:
            langs.append('unk')  # fall back to 'unknown' when classification fails
    tbl['textcat_langs'] = langs
    return tbl
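# A minimal usage sketch for get_textcat_languages (not part of the original
# snippet); assumes pandas is installed and the crubadan corpus is available.
import pandas as pd

df = pd.DataFrame({"text": ["Hello world, how are you?", "Bonjour tout le monde"]})
df = get_textcat_languages(df, "text")
print(df["textcat_langs"])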
Code Example #3
File: graph.py  Project: jindrvo1/ami-summarization
    "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
    "these", "those", "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an",
    "the", "and", "but", "if", "or", "because", "as", "until", "while", "of",
    "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "to", "from",
    "up", "down", "in", "out", "on", "off", "over", "under", "again",
    "further", "then", "once", "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some",
    "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
    "very", "s", "t", "can", "will", "just", "don", "should", "now"
]

# imports assumed by this snippet (the top of the original file is cut off)
import networkx as net
from nltk.classify import textcat as tc
from nltk.stem import snowball

stemmer = snowball.EnglishStemmer()

text_classifier = tc.TextCat()


# create a graph where nodes are sentences and edges connect similar sentences
def create_graph(sentences, words):
    graph = net.Graph()
    for s in sentences:
        graph.add_node(s.raw_text)
    graph = add_edges(graph, sentences, words)
    return graph


# create an edge when the similarity of two sentences is above a certain threshold
def add_edges(graph, sentences, words):
    for s1 in sentences:
        for s2 in sentences:
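# The repository's similarity check is cut off above. As a generic sketch (not
# the project's code), sentence similarity is often measured as the Jaccard
# overlap of stemmed tokens, with an edge added when it exceeds a threshold:
def jaccard_similarity(s1, s2):
    tokens1 = {stemmer.stem(w) for w in s1.raw_text.lower().split()}
    tokens2 = {stemmer.stem(w) for w in s2.raw_text.lower().split()}
    if not tokens1 or not tokens2:
        return 0.0
    return len(tokens1 & tokens2) / len(tokens1 | tokens2)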
Code Example #4
# Language detection tools
from collections import defaultdict

import fasttext
from langdetect import detect_langs
from polyglot.detect import Detector
from langid.langid import LanguageIdentifier, model
from nltk.classify import textcat
from utility import set_iso_639

# Load the fasttext language-identification model
ft_model = fasttext.load_model('lib/lid.176.bin')

# Instantiate a langid language identifier object
langid_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

# Instantiate a textcat language classifier
tc_cls = textcat.TextCat()


def detect_language(text, guarani=False):
    '''
    Return the detected language as an ISO 639-1 code.
    '''
    threshold_confidence = 0.70  # lowered from 0.75 because the Guarani codes gn/grn/gug are tricky
    lang_detected = defaultdict(int)

    if not text:
        raise Exception('Error: text is empty.')

    # infer language using fasttext
    try:
        pred_fasttext = ft_model.predict(text, k=1)
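# The rest of this try/except block is cut off above. As a generic sketch (not
# the project's code), fastText's predict() returns labels such as
# '__label__en' plus confidences, which are typically unpacked like this:
#     labels, confidences = pred_fasttext
#     iso_code = labels[0].replace('__label__', '')   # e.g. 'en'
#     if confidences[0] >= threshold_confidence:
#         lang_detected[iso_code] += 1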
Code Example #5
from grpc_client import *
from data_formats import *
from nltk.classify import textcat
import grpc
import json
import re

if __name__ == "__main__":
    f_out = "data_with_languages.json"
    language_classifier = textcat.TextCat()
    journalCollections = dict()

    host = "172.22.247.23:8888"
    with grpc.insecure_channel(host) as channel:
        stub = protob_pb2_grpc.BHLIndexStub(channel)
        for title in Titles(stub):
            text = ""
            text_index = 10  # sample text from only the first 10 pages of each title
            journal = Journal(title)
            for page in Pages(stub, withText=True, titles=[title.id]):
                if text_index > 0:
                    txt = page.text
                    txt = txt.replace(b'\r', b'')
                    txt = txt.replace(b'\n', b'')
                    txt = txt.decode("ascii", "ignore")
                    text += re.sub(
                        r'[^\w]', ' ',
                        txt)  # Replace all non-word characters with spaces
                    text += " "
                journal.add_page(page)
                text_index -= 1
            # Classifying the language parameter
            journal.lang = language_classifier.guess_language(text.lower())
Code Example #6
File: imdb.py  Project: JauharulF/sentiment
    sql = """ SELECT id, {}_data FROM sentiment.{} ORDER BY id""".format(
        lang, table)
    cursor.execute(sql)
    rs = cursor.fetchall()
    for row in rs:
        # print('%s (%s) %s' % (row['sentence'], crubadan.iso_to_crubadan(tct.guess_language(row['sentence'])), lid.classify(row['sentence'])))
        sql = """ UPDATE sentiment.{} SET {}_textcat=%s, {}_langid=%s WHERE id=%s """.format(
            table, lang, lang)
        cursor.execute(
            sql, (crubadan.iso_to_crubadan(
                tct.guess_language(row['{}_data'.format(lang)])),
                  lid.classify(row['{}_data'.format(lang)])[0], row['id']))
        conn.commit()
        print('.', end='', flush=True)


tct = textcat.TextCat()
lid = LanguageIdentifier.from_modelstring(model, norm_probs=True)
# lid.set_languages(['en', 'ms'])

dbcon = MySQLdb.connect(host='localhost',
                        user='******',
                        passwd='123456',
                        db='sentiment',
                        cursorclass=MySQLdb.cursors.DictCursor)
# cursor = dbcon.cursor()
# process_lid(dbcon, 'imdb_train', 'en', tct, lid)
# process_lid(dbcon, 'imdb_train', 'ms', tct, lid)
# process_lid(dbcon, 'imdb_test', 'en', tct, lid)
# process_lid(dbcon, 'imdb_test', 'ms', tct, lid)
Code Example #7
File: script.py  Project: tusharma78/LD-AS
    'ko': 'Korean',
    'ar': 'Arabic',
    'zh': 'Chinese (Simplified)',
    'cnr': 'Montenegrin [2]',
    'zh-TW': 'Chinese (Traditional)',
    'ne': 'Nepali',
    'gu': 'Gujarati',
    'ta': 'Tamil',
    'he': 'Hebrew',
    'te': 'Telugu',
    'en': 'English'
}

Text = input("Enter the Text: ")

classifier = textcat.TextCat()

distances = classifier.lang_dists(Text)  # out-of-place distance to each known language profile
ans = classifier.guess_language(Text)  # language whose profile has the smallest distance

# Goslate Language Detector

# gs = goslate.Goslate()
# lan_id = gs.detect(Text)

language = lang_identifier(Text)

txt = '(ISO639-3) Code: ' + language
res = " ".join(line.get(ele, ele) for ele in txt.split())
# print("Detected Language: ",gs.get_languages()[lan_id], res)