from nltk.classify import textcat


def detect_language(sentences):
    """Detect the language of each sentence using TextCat,
    an implementation of the text categorization algorithm."""
    # Use TextCat to categorize the text
    text_cat = textcat.TextCat()
    # Print the detected language of each sentence
    for sentence in sentences:
        print("the sentence:\n\t '{0}' => is written in {1}".format(
            sentence, text_cat.guess_language(sentence)))
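# A minimal usage sketch, assuming NLTK is installed along with the
# 'crubadan' and 'punkt' data packages that TextCat relies on; the sample
# sentences are illustrative only.
# import nltk; nltk.download('crubadan'); nltk.download('punkt')
detect_language([
    "This is an English sentence.",
    "Ceci est une phrase en français.",
])
# guess_language() returns ISO 639-3 codes such as 'eng' or 'fra'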
from nltk.classify import textcat as tc


def get_textcat_languages(tbl, col):
    """Return `tbl` with a new 'textcat_langs' column holding the TextCat
    language guess for each value in column `col`."""
    t = tc.TextCat()
    tbl.is_copy = False  # legacy pandas attribute to silence SettingWithCopyWarning
    langs = []
    for s in tbl[col]:
        try:
            langs.append(t.guess_language(s))
        except Exception:
            # Fall back to 'unk' when TextCat cannot classify the value
            langs.append('unk')
    tbl['textcat_langs'] = langs
    return tbl
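# A minimal usage sketch, assuming pandas is installed; the DataFrame
# contents are illustrative only.
import pandas as pd

df = pd.DataFrame({'text': ["Hello world again", "Hola mundo otra vez"]})
df = get_textcat_languages(df, 'text')
print(df['textcat_langs'])  # ISO 639-3 guesses, e.g. 'eng', 'spa'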
"theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now" ] stemmer = snowball.EnglishStemmer() text_classifier = tc.TextCat() # create graph where nodes are sentences and edges are present if sentences are similar def create_graph(sentences, words): graph = net.Graph() for s in sentences: graph.add_node(s.raw_text) graph = add_edges(graph, sentences, words) return graph # create an edge in case the similarity of two sentences is above certain threshold def add_edges(graph, sentences, words): for s1 in sentences: for s2 in sentences:
# Language detection tools
from collections import defaultdict

import fasttext
from langdetect import detect_langs
from polyglot.detect import Detector
from langid.langid import LanguageIdentifier, model
from nltk.classify import textcat
from utility import set_iso_639

# Load the pre-trained fasttext language-identification model
ft_model = fasttext.load_model('lib/lid.176.bin')

# Instantiate a langid language identifier object
langid_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

# Instantiate a textcat language classifier
tc_cls = textcat.TextCat()


def detect_language(text, guarani=False):
    '''Return an ISO 639-1 code.'''
    threshold_confidence = 0.70  # lowered from 0.75 because gn/grn/gug is tricky
    lang_detected = defaultdict(int)
    if not text:
        raise ValueError('Error: text is empty.')
    # Infer the language using fasttext
    try:
        pred_fasttext = ft_model.predict(text, k=1)
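# The excerpt ends inside the try block above. For reference, a hedged
# sketch of how a k=1 fasttext prediction is usually unpacked; comparing it
# against threshold_confidence is an assumption, not the source's code.
labels, probs = ft_model.predict("some sample text", k=1)
lang = labels[0].replace('__label__', '')  # e.g. '__label__en' -> 'en'
confidence = float(probs[0])
if confidence >= 0.70:
    print(lang, confidence)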
from grpc_client import *
from data_formats import *
from nltk.classify import textcat
import json, re

if __name__ == "__main__":
    f_out = "data_with_languages.json"
    language_classifier = textcat.TextCat()
    journalCollections = dict()
    host = "172.22.247.23:8888"
    with grpc.insecure_channel(host) as channel:
        stub = protob_pb2_grpc.BHLIndexStub(channel)
        for title in Titles(stub):
            text = ""
            text_index = 10
            journal = Journal(title)
            for page in Pages(stub, withText=True, titles=[title.id]):
                if text_index > 0:
                    txt = page.text
                    txt = txt.replace(b'\r', b'')
                    txt = txt.replace(b'\n', b'')
                    txt = txt.decode("ascii", "ignore")
                    # Replace every non-word character with a space
                    text += re.sub(r'[^\w]', ' ', txt)
                    text += " "
                journal.add_page(page)
                text_index -= 1
            # Classify the language of the sampled page text
            journal.lang = language_classifier.guess_language(text.lower())
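# A small illustration of the cleaning step above, assuming page.text holds
# raw bytes; the sample input is illustrative only.
sample = b"Falco\r\n peregrinus, 1771!"
cleaned = sample.replace(b'\r', b'').replace(b'\n', b'').decode("ascii", "ignore")
print(re.sub(r'[^\w]', ' ', cleaned))  # 'Falco peregrinus  1771 '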
sql = """ SELECT id, {}_data FROM sentiment.{} ORDER BY id""".format( lang, table) cursor.execute(sql) rs = cursor.fetchall() for row in rs: # print('%s (%s) %s' % (row['sentence'], crubadan.iso_to_crubadan(tct.guess_language(row['sentence'])), lid.classify(row['sentence']))) sql = """ UPDATE sentiment.{} SET {}_textcat=%s, {}_langid=%s WHERE id=%s """.format( table, lang, lang) cursor.execute( sql, (crubadan.iso_to_crubadan( tct.guess_language(row['{}_data'.format(lang)])), lid.classify(row['{}_data'.format(lang)])[0], row['id'])) conn.commit() print('.', end='', flush=True) tct = textcat.TextCat() lid = LanguageIdentifier.from_modelstring(model, norm_probs=True) # lid.set_languages(['en', 'ms']) dbcon = MySQLdb.connect(host='localhost', user='******', passwd='123456', db='sentiment', cursorclass=MySQLdb.cursors.DictCursor) # cursor = dbcon.cursor() # process_lid(dbcon, 'imdb_train', 'en', tct, lid) # process_lid(dbcon, 'imdb_train', 'ms', tct, lid) # process_lid(dbcon, 'imdb_test', 'en', tct, lid) # process_lid(dbcon, 'imdb_test', 'ms', tct, lid)
    'ko': 'Korean',
    'ar': 'Arabic',
    'zh': 'Chinese (Simplified)',
    'cnr': 'Montenegrin [2]',
    'zh-TW': 'Chinese (Traditional)',
    'ne': 'Nepali',
    'gu': 'Gujarati',
    'ta': 'Tamil',
    'he': 'Hebrew',
    'te': 'Telugu',
    'en': 'English'
}

Text = input("Enter the Text: ")
classifier = textcat.TextCat()
# Distances from the input text to each of TextCat's language profiles
distances = classifier.lang_dists(Text)
# print(input_text)
ans = classifier.guess_language(Text)

# Goslate Language Detector
# gs = goslate.Goslate()
# lan_id = gs.detect(Text)

language = lang_identifier(Text)
txt = '(ISO639-3) Code: ' + language
# Swap any token found in the mapping for its human-readable name
res = " ".join(line.get(ele, ele) for ele in txt.split())
# print("Detected Language: ", gs.get_languages()[lan_id], res)
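# lang_identifier is not defined in this excerpt; a minimal sketch of a
# compatible implementation backed by TextCat (an assumption, not the
# source's definition):
def lang_identifier(text):
    # TextCat's guess_language already returns an ISO 639-3 code
    return textcat.TextCat().guess_language(text)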