Ejemplo n.º 1
0
    def part_of_speeach(self):
        """Tag the first lyric with the Stanford (Arabic) POS tagger.

        Returns a single string of "word tag" pairs separated by " / ",
        including a trailing separator, e.g. "I PRP / ran VBD / ".
        """
        # NOTE(review): JAVA_HOME conventionally points at the JDK/JRE
        # directory, not the java binary — confirm NLTK finds Java this way.
        os.environ["JAVA_HOME"] = "/usr/bin/java"
        jar = '/home/ahmad/PycharmProjects/untitled1/stanford-postagger-full-2015-12-09/stanford-postagger.jar'
        model = '/home/ahmad/PycharmProjects/untitled1/stanford-postagger-full-2015-12-09/models/arabic.tagger'
        tagger = StanfordPOSTagger(model, jar)
        tagger.java_options = '-mx4096m'  # higher memory limit for long sentences

        tagged = tagger.tag(word_tokenize(self.lyrics[0]))
        # join() avoids the quadratic cost of repeated string concatenation
        return ''.join(word + ' ' + tag + ' / ' for word, tag in tagged)
Ejemplo n.º 2
0
def postagger_nltk(word_lists):
    """POS-tag sentences with the Stanford POS tagger (Chinese model).

    word_lists - iterable of sentence strings; each sentence is
        whitespace-split into tokens before tagging

    Returns a list of strings, one per input sentence, of "word/tag"
    tokens separated by single spaces.  The tagger emits tokens as
    "word#tag" pairs, which are reformatted here.
    """
    os.environ['JAVAHOME'] = JAVA_PATH
    os.environ["STANFORD_PARSER"] = STANFORD_POSTAGGER_PATH
    os.environ["STANFORD_MODELS"] = STANFORD_POSTAGGER_MODELS
    chinese_tagger = StanfordPOSTagger(model_filename=nltk_pos_model_filename,
                                       path_to_jar=STANFORD_POSTAGGER_PATH)
    chinese_tagger.java_options = '-mx12000m'  # large heap for long sentences
    nltk_all_tag = []
    # enumerate replaces the hand-rolled `flag` counter of the original
    for flag, sentence in enumerate(word_lists, start=1):
        analyse = chinese_tagger.tag(sentence.split())
        pairs = []
        for tag_tuple in analyse:
            # tag_tuple[1] is "word#tag"; rebuild as "word/tag"
            parts = tag_tuple[1].split("#")
            pairs.append(parts[0] + "/" + parts[1])
        # " ".join avoids quadratic += concatenation and the final strip()
        nltk_all_tag.append(" ".join(pairs))
        print("######LTP POSTagger finished " + str(flag) + " sentences")
    print("NLTK POATagger is finished!!")

    return nltk_all_tag
Ejemplo n.º 3
0
    def __init__(self, file_path, tagged_words_path=None):
        '''Creates a Collocations instance with a text

        file_path - string path to .txt input file; used to generate full
            description of results in output file, whether or not tagged_words
            is given
        tagged_words_path - string path to .txt file containing string
            representation of list of tagged words in input file; saves time and
            resources on computation
        '''

        self.file_path = file_path

        if tagged_words_path is None:
            # open input file, extract text; `with` guarantees the handle
            # is closed even if read() raises
            with open(file_path, 'r', encoding='utf-8') as document:
                raw = document.read().lower()

            # tokenize text into words and tag parts of speech using the
            # Stanford part-of-speech tagger
            sentences = nltk.sent_tokenize(raw)
            tokenized_sentences = [nltk.word_tokenize(w) for w in sentences]

            java_path = 'C:/Program Files/Java/jdk-9.0.1/bin/java.exe'
            os.environ['JAVAHOME'] = java_path
            path_to_model = ('stanford-postagger-2017-06-09/models/'
                'english-left3words-distsim.tagger')
            path_to_jar = ('stanford-postagger-2017-06-09/'
                'stanford-postagger.jar')
            tagger = StanfordPOSTagger(path_to_model, path_to_jar)
            tagger.java_options = '-mx4096m'  # larger heap for long sentences
            tagged_sentences = tagger.tag_sents(tokenized_sentences)
            # flatten list-of-sentences into one list of (word, tag) pairs;
            # a comprehension avoids the quadratic cost of sum(lists, [])
            self.tagged_words = [pair for sent in tagged_sentences
                                 for pair in sent]
        else:
            # load pre-tagged words; literal_eval parses the literal safely
            # (no arbitrary code execution, unlike eval)
            import ast
            with open(tagged_words_path, 'r', encoding='utf-8') as document:
                self.tagged_words = ast.literal_eval(document.read())
Ejemplo n.º 4
0
from firebase_admin import credentials, firestore

# Authenticate against Firebase with a service-account key file and open
# a Firestore client.  NOTE(review): `firebase_admin` itself must be
# imported elsewhere in this file for initialize_app below to resolve.
cred = credentials.Certificate(
    "../DjangoBackendOne/news/newsapp-54f7c-firebase-adminsdk-wzja4-dc085fad0b.json"
)
firebase_admin.initialize_app(cred)
db = firestore.client()

# Stanford POS tagger setup: jar + English left3words model, with the JVM
# located via both the JAVAHOME env var and nltk's explicit java config.
jar = '../DjangoBackendOne/stanford-postagger-2018-10-16/stanford-postagger.jar'
model = '../DjangoBackendOne/stanford-postagger-2018-10-16/models/english-left3words-distsim.tagger'
java_path = "C:/Program Files/Java/jdk1.8.0_101/bin/java.exe"

os.environ['JAVAHOME'] = java_path
nltk.internals.config_java('C:/Program Files/Java/jdk1.8.0_101/bin/java.exe')
pos_tagger = StanfordPOSTagger(model, jar)
pos_tagger.java_options = '-mx4096m'  # larger JVM heap for long sentences

# Client-side Firebase web config (used by pyrebase; see commented call below)
config = {
    "apiKey": "AIzaSyBJumddViT3Y70F6vmEdP_1VMGXqEFaqgg",
    "authDomain": "newsapp-54f7c.firebaseapp.com",
    "databaseURL": "https://newsapp-54f7c.firebaseio.com",
    "projectId": "newsapp-54f7c",
    "storageBucket": "newsapp-54f7c.appspot.com",
    "messagingSenderId": "841850292385"
}

# firebase = pyrebase.initialize_app(config)

# Accumulators filled in later (presumably by code past this chunk)
newsObjects = []
entityCount = 1
Ejemplo n.º 5
0
from nltk import word_tokenize

import os

# NOTE(review): JAVA_HOME conventionally points at the JDK/JRE directory,
# not the java binary itself — confirm NLTK locates Java with this value.
os.environ["JAVA_HOME"] = "/usr/bin/java"

# Alternatively to setting the CLASSPATH add the jar and model via their path:
jar = '/home/ahmad/PycharmProjects/untitled1/stanford-postagger-2011-04-20/stanford-postagger.jar'
model = '/home/ahmad/PycharmProjects/untitled1/stanford-postagger-2011-04-20/models/left3words-wsj-0-18.tagger'
tagger = StanfordPOSTagger(model, jar)

tagger.java_options = '-mx4096m'  # higher memory limit for long sentences

# Tag an Arabic sentence.  NOTE(review): left3words-wsj is an English
# (WSJ-trained) model — confirm this pairing is intentional.
text = tagger.tag(word_tokenize(u'أنا تسلقت شجرة'))

# Build "word tag / word tag / " output (trailing separator preserved);
# join() avoids the quadratic cost of += concatenation in a loop.
s = ''.join(word + ' ' + tag + ' / ' for word, tag in text)

# print(s) works under both Python 2 and 3; the original `print s`
# statement is a SyntaxError on Python 3.
print(s)