from nltk.classify import textcat


def detect_language(sentences):
    """Detect the language of each sentence using TextCat,
    an implementation of the text categorization algorithm."""
    # Use TextCat to categorize the text
    text_cat = textcat.TextCat()
    # Print the detected language of each sentence
    for sentence in sentences:
        print("the sentence:\n\t '{0}' => is written in {1}".format(
            sentence, text_cat.guess_language(sentence)))
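# A minimal usage sketch, assuming NLTK is installed along with the
# 'crubadan' and 'punkt' data packages that TextCat relies on; the sample
# sentences are illustrative only.
# import nltk; nltk.download('crubadan'); nltk.download('punkt')
detect_language([
    "This is an English sentence.",
    "Ceci est une phrase en français.",
])
# guess_language() returns ISO 639-3 codes such as 'eng' or 'fra'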
from nltk.classify import textcat as tc


def get_textcat_languages(tbl, col):
    """Return `tbl` with a new 'textcat_langs' column holding the TextCat
    language guess for each value in column `col`."""
    t = tc.TextCat()
    tbl.is_copy = False  # legacy pandas attribute to silence SettingWithCopyWarning
    langs = []
    for s in tbl[col]:
        try:
            langs.append(t.guess_language(s))
        except Exception:
            # Fall back to 'unk' when TextCat cannot classify the value
            langs.append('unk')
    tbl['textcat_langs'] = langs
    return tbl
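# A minimal usage sketch, assuming pandas is installed; the DataFrame
# contents are illustrative only.
import pandas as pd

df = pd.DataFrame({'text': ["Hello world again", "Hola mundo otra vez"]})
df = get_textcat_languages(df, 'text')
print(df['textcat_langs'])  # ISO 639-3 guesses, e.g. 'eng', 'spa'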
"theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now" ] stemmer = snowball.EnglishStemmer() text_classifier = tc.TextCat() # create graph where nodes are sentences and edges are present if sentences are similar def create_graph(sentences, words): graph = net.Graph() for s in sentences: graph.add_node(s.raw_text) graph = add_edges(graph, sentences, words) return graph # create an edge in case the similarity of two sentences is above certain threshold def add_edges(graph, sentences, words): for s1 in sentences: for s2 in sentences:
# Language detection tools
from collections import defaultdict

import fasttext
from langdetect import detect_langs
from polyglot.detect import Detector
from langid.langid import LanguageIdentifier, model
from nltk.classify import textcat
from utility import set_iso_639

# Load the pre-trained fasttext language-identification model
ft_model = fasttext.load_model('lib/lid.176.bin')

# Instantiate a langid language identifier object
langid_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

# Instantiate a textcat language classifier
tc_cls = textcat.TextCat()


def detect_language(text, guarani=False):
    '''Return an ISO 639-1 code.'''
    threshold_confidence = 0.70  # lowered from 0.75 because gn/grn/gug is tricky
    lang_detected = defaultdict(int)
    if not text:
        raise ValueError('Error: text is empty.')
    # Infer the language using fasttext
    try:
        pred_fasttext = ft_model.predict(text, k=1)
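# The excerpt ends inside the try block above. For reference, a hedged
# sketch of how a k=1 fasttext prediction is usually unpacked; comparing it
# against threshold_confidence is an assumption, not the source's code.
labels, probs = ft_model.predict("some sample text", k=1)
lang = labels[0].replace('__label__', '')  # e.g. '__label__en' -> 'en'
confidence = float(probs[0])
if confidence >= 0.70:
    print(lang, confidence)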
from grpc_client import *
from data_formats import *
from nltk.classify import textcat
import json, re

if __name__ == "__main__":
    f_out = "data_with_languages.json"
    language_classifier = textcat.TextCat()
    journalCollections = dict()
    host = "172.22.247.23:8888"
    with grpc.insecure_channel(host) as channel:
        stub = protob_pb2_grpc.BHLIndexStub(channel)
        for title in Titles(stub):
            text = ""
            text_index = 10
            journal = Journal(title)
            for page in Pages(stub, withText=True, titles=[title.id]):
                if text_index > 0:
                    txt = page.text
                    txt = txt.replace(b'\r', b'')
                    txt = txt.replace(b'\n', b'')
                    txt = txt.decode("ascii", "ignore")
                    # Replace every non-word character with a space
                    text += re.sub(r'[^\w]', ' ', txt)
                    text += " "
                journal.add_page(page)
                text_index -= 1
            # Classify the language of the sampled page text
            journal.lang = language_classifier.guess_language(text.lower())
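# A small illustration of the cleaning step above, assuming page.text holds
# raw bytes; the sample input is illustrative only.
sample = b"Falco\r\n peregrinus, 1771!"
cleaned = sample.replace(b'\r', b'').replace(b'\n', b'').decode("ascii", "ignore")
print(re.sub(r'[^\w]', ' ', cleaned))  # 'Falco peregrinus  1771 '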
sql = """ SELECT id, {}_data FROM sentiment.{} ORDER BY id""".format( lang, table) cursor.execute(sql) rs = cursor.fetchall() for row in rs: # print('%s (%s) %s' % (row['sentence'], crubadan.iso_to_crubadan(tct.guess_language(row['sentence'])), lid.classify(row['sentence']))) sql = """ UPDATE sentiment.{} SET {}_textcat=%s, {}_langid=%s WHERE id=%s """.format( table, lang, lang) cursor.execute( sql, (crubadan.iso_to_crubadan( tct.guess_language(row['{}_data'.format(lang)])), lid.classify(row['{}_data'.format(lang)])[0], row['id'])) conn.commit() print('.', end='', flush=True) tct = textcat.TextCat() lid = LanguageIdentifier.from_modelstring(model, norm_probs=True) # lid.set_languages(['en', 'ms']) dbcon = MySQLdb.connect(host='localhost', user='******', passwd='123456', db='sentiment', cursorclass=MySQLdb.cursors.DictCursor) # cursor = dbcon.cursor() # process_lid(dbcon, 'imdb_train', 'en', tct, lid) # process_lid(dbcon, 'imdb_train', 'ms', tct, lid) # process_lid(dbcon, 'imdb_test', 'en', tct, lid) # process_lid(dbcon, 'imdb_test', 'ms', tct, lid)
    'ko': 'Korean',
    'ar': 'Arabic',
    'zh': 'Chinese (Simplified)',
    'cnr': 'Montenegrin [2]',
    'zh-TW': 'Chinese (Traditional)',
    'ne': 'Nepali',
    'gu': 'Gujarati',
    'ta': 'Tamil',
    'he': 'Hebrew',
    'te': 'Telugu',
    'en': 'English'
}

Text = input("Enter the Text: ")
classifier = textcat.TextCat()
# Distances from the input text to each of TextCat's language profiles
distances = classifier.lang_dists(Text)
# print(input_text)
ans = classifier.guess_language(Text)

# Goslate Language Detector
# gs = goslate.Goslate()
# lan_id = gs.detect(Text)

language = lang_identifier(Text)
txt = '(ISO639-3) Code: ' + language
# Swap any token found in the mapping for its human-readable name
res = " ".join(line.get(ele, ele) for ele in txt.split())
# print("Detected Language: ", gs.get_languages()[lan_id], res)
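# lang_identifier is not defined in this excerpt; a minimal sketch of a
# compatible implementation backed by TextCat (an assumption, not the
# source's definition):
def lang_identifier(text):
    # TextCat's guess_language already returns an ISO 639-3 code
    return textcat.TextCat().guess_language(text)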