def common_description(self, s0, s1):
    tagger = PerceptronTagger()
    s0_tags = tagger.tag(s0)
    s1_tags = tagger.tag(s1)
    total_dist = 0
    for word, tag in s0_tags:
        if tag.startswith('N') or tag.startswith('V') or tag.startswith('J') or tag.startswith('R'):
            max_dist = 0
            for synset in wn.synsets(word, self.penn_to_wn(tag)):
                desc = word_tokenize(synset.definition())
                dist = len(list(set(s1) & set(desc)))
                if dist > max_dist:
                    max_dist = dist
            total_dist += max_dist
    for word, tag in s1_tags:
        if tag.startswith('N') or tag.startswith('V') or tag.startswith('J') or tag.startswith('R'):
            max_dist = 0
            for synset in wn.synsets(word, self.penn_to_wn(tag)):
                desc = word_tokenize(synset.definition())
                dist = len(list(set(s0) & set(desc)))
                if dist > max_dist:
                    max_dist = dist
            total_dist += max_dist
    return total_dist
def smaller_subtree_containing_the_drugs(sentence, target_drugs):
    tree_string = nlp.annotate(sentence, properties={
        'annotators': 'parse',
        'outputFormat': 'json'
    })
    tagger = PerceptronTagger()
    best_subtree = None
    size = 9999999
    target_drugs = [dr for drug in target_drugs for dr in drug.split(' ')]
    for s in tree_string['sentences']:
        tree_parsed = Tree.fromstring(s['parse'])
        for subtree in tree_parsed.subtrees():
            # print(subtree.pretty_print())
            leafs = subtree.leaves()
            current_size = len(leafs)
            if all_drugs_in_tree(target_drugs, leafs):
                if current_size < size:
                    best_subtree = subtree
                    size = current_size
                    # print(subtree.leaves())
    try:
        clean = clean_sentence(best_subtree.leaves())
    except AttributeError:
        # no subtree contained all the drugs (best_subtree is None); fall back to the whole sentence
        clean = clean_sentence(sentence.split())
    # print('clean', clean)
    tagged = tagger.tag(clean)
    # print('tag:', tagged)
    lemmatized = preprocessor_lemmatize(tagged)
    # print('lemmatized', lemmatized)
    new_sentence = ' '.join([l for l, t in lemmatized])
    return new_sentence
def count_common_propper_nouns(self, s0, s1):
    tagger = PerceptronTagger()
    s0_tags = tagger.tag(s0)
    s1_tags = tagger.tag(s1)
    NNP_s0 = [values[0] for values in s0_tags if values[1] == 'NNP']
    NNP_s1 = [values[0] for values in s1_tags if values[1] == 'NNP']
    return len(set(NNP_s0) & set(NNP_s1))
def CorpusListPhrase(self, matrix, stopwords):
    phrase_list = []
    grammar = r"""
        NBAR:
            {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
    """
    tagger = PerceptronTagger()
    pos_tag = tagger.tag
    # Create phrase tree
    chunker = nltk.RegexpParser(grammar)
    for doc in matrix:
        phrase = self._flatten([
            word for word in self._getTerms(
                chunker.parse(pos_tag(re.findall(r'\w+', str(doc)))))
            if word not in stopwords
        ])
        phrase_list.append(",".join(phrase))
    return phrase_list
def extract_tokens(row, lemmatize=True, use_tag=True):
    tokenizer = WhitespaceTokenizer()
    if lemmatize:
        # reduce words to lemmas
        pattern = '[().*+,?!\'\";:]*'
        token_list = list()
        if use_tag:
            # use POS tags to obtain more accurate lemmas
            pos_tags = PerceptronTagger().tag(tokenizer.tokenize(row['text']))
            lemmatizer_input = map(
                lambda x: (x[0], nltk_to_wordnet.get(x[1][0])), pos_tags)
            lemmatizer = WordNetLemmatizer()
            for word, tag in lemmatizer_input:
                if word != 'urlLink' and 'http:' not in word:
                    word = word.lower()
                    if tag is None:
                        tok = lemmatizer.lemmatize(word)
                        tok = re.sub(pattern, '', tok)
                        if not tok.isdigit():
                            token_list.append(tok)
                    else:
                        tok = lemmatizer.lemmatize(word, tag)
                        tok = re.sub(pattern, '', tok)
                        if not tok.isdigit():
                            token_list.append(tok)
        else:
            # do not use a tagger if not specified and speed up computation
            lemmatizer_input = tokenizer.tokenize(row['text'])
            lemmatizer = WordNetLemmatizer()
            for word in lemmatizer_input:
                if word != 'urlLink' and 'http:' not in word:
                    tok = lemmatizer.lemmatize(word.lower())
                    tok = re.sub(pattern, '', tok)
                    if not tok.isdigit():
                        token_list.append(tok)
    else:
        # simply tokenize based on whitespaces
        token_list = tokenizer.tokenize(row['text'])
    return token_list
def train_corpus_to_tag():
    """
    Train tagger on Alpino Corpus
    :return: model tagger <type: 'model'>
    """
    alp_tagged_sent = list(alp.tagged_sents())
    tagger = PerceptronTagger(load=False)
    tagger.train(alp_tagged_sent)
    return tagger
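# A minimal usage sketch for the helper above (an assumption, not part of the original
# snippet): train the Alpino-based tagger and tag a made-up Dutch sentence. It assumes
# `alp` (nltk.corpus.alpino) is imported and the Alpino corpus has been downloaded.
dutch_tagger = train_corpus_to_tag()
print(dutch_tagger.tag('Dit is een voorbeeldzin'.split()))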
def __init__(self, job_title_col, url_col, description_col, label_col,
             word_col, encoded_job_title_col, indeed_file, words_file):
    '''
    Parameters
    ----------
    job_title_col: str. column name that contains the job titles of the job postings
    url_col: str. column name that contains the urls of the job postings
    description_col: str. column name that contains the job descriptions of the job postings
    label_col: str. column name that contains the job group in set
        {"Data Scientist", "Machine Learning Engineer", "Data Engineer", "Data Analyst", "None"}
    word_col: str. column name that contains the hard skills
    encoded_job_title_col: str. column name that contains the encoded job group
    df_indeed: pandas df. the dataframe with the scraped job postings
    df_words: pandas df. the dataframe with the hard skills
    '''
    # initialize attributes related to the dataset
    self.job_title_col = job_title_col
    self.url_col = url_col
    self.description_col = description_col
    self.label_col = label_col
    self.word_col = word_col
    self.encoded_job_title_col = encoded_job_title_col

    # load the scraped files
    self.df_indeed = self._load_data(indeed_file)
    self.df_words = self._load_data(words_file)

    # initialize attributes related to extracted features
    self.job_description = None
    self.word_list = None
    self.features_list_single = []
    self.features_list_phrase = []
    self.topk_single = None
    self.topk_phrase = None
    self.topk_full = None
    self.df_single = pd.DataFrame()
    self.df_phrase = pd.DataFrame()
    self.df = pd.DataFrame()
    self.df_tools = pd.DataFrame()
    self.top_tools_dict = {}

    # initialize attributes related to keyphrase extraction
    self.grammar = self._initialize_grammar()
    self.stop = self._initialize_stopwords()
    self.text = """ initialize """
    self.tagger = PerceptronTagger()
    self.pos_tag = self.tagger.tag
    self.chunker = nltk.RegexpParser(self.grammar)
    self.taggedToks = self.pos_tag(re.findall(r'\w+', self.text))
    self.tree = self.chunker.parse(self.taggedToks)

    # perform pre-processing, feature extraction and post-processing
    self._execute_pre_processing()
    self._execute_feature_extraction()
    self._execute_post_processing()
def test_perceptron_tagger(self):
    tagger = PerceptronTagger(load=False)
    tagger.train(self.corpus)
    encoded = self.encoder.encode(tagger)
    decoded = self.decoder.decode(encoded)
    self.assertEqual(tagger.model.weights, decoded.model.weights)
    self.assertEqual(tagger.tagdict, decoded.tagdict)
    self.assertEqual(tagger.classes, decoded.classes)
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
    tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)
    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes, i.e. no downloads.
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)
def status():
    from autogoal.contrib import ContribStatus

    try:
        from nltk.corpus import wordnet
        from nltk.corpus import sentiwordnet
        from nltk.corpus import stopwords
        from nltk.stem import RSLPStemmer
        st = RSLPStemmer()
        from nltk.tag import PerceptronTagger
        tagger = PerceptronTagger()
    except LookupError:
        return ContribStatus.RequiresDownload

    return ContribStatus.Ready
def train_tagger(language, model_type, feature, train_sents):
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
    elif model_type == 'crf':
        tagger = CRFTagger()
        tagger.train(train_sents, 'taggers/{0}/{1}/crf.pickle'.format(language, feature))
    elif model_type == 'perceptron':
        tagger = PerceptronTagger(load=False)
        tagger.train(train_sents)
    return tagger
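# A hedged usage sketch (not from the original code): train the perceptron variant of
# train_tagger on the NLTK treebank sample and tag a held-out sentence. The split size
# and the 'en'/'pos' arguments are illustrative assumptions; the perceptron branch
# ignores the language and feature parameters anyway.
from nltk.corpus import treebank

train_sents = treebank.tagged_sents()[:3000]
perceptron = train_tagger('en', 'perceptron', 'pos', train_sents)
print(perceptron.tag(treebank.sents()[3500]))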
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    # return pkgutil.get_data('scattertext',
    #                         'data/viz/semiotic_new.html').decode('utf-8')
    path = os.path.dirname(sys.modules['scattertext'].__file__) + '/data/'
    tokenizer_fn = path + 'punkt.english.pickle'
    tagger_fn = path + 'averaged_perceptron_tagger.pickle'
    # tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
    # tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)
    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes, i.e. no downloads.
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)
def __init__(self, language, stanford=False):
    if not language:
        raise ValueError("No language specified for POS tagging")
    else:
        self._language = language

    if self._language == "eng" and stanford:
        self.model = StanfordPOSTagger(r'english-bidirectional-distsim.tagger')
        self.tagger = self.model.tag
    elif self._language == "eng":
        try:
            # "new" nltk with slow default behaviour through high-level API
            from nltk.tag import PerceptronTagger
            self.model = PerceptronTagger()
            self.tagger = self.model.tag
        except ImportError:
            self.model = None
            self.tagger = nltk.pos_tag
    elif self._language == "afr":
        self.model = HunposTagger(join(_MODEL_DIR, "pos-tag-model.af"), encoding='utf-8')
        self.tagger = self.model.tag
    elif self._language == "nso":
        self.model = HunposTagger(join(_MODEL_DIR, "simple-pos-tag-model.nso"), encoding='utf-8')
        self.tagger = self.model.tag
    elif self._language == "zul":
        # self.model = MarmotTagger(encoding='utf-8')
        self.model = HunposTagger(join(_MODEL_DIR, "simple-pos-tag-model.zu"), encoding='utf-8')
        self.tagger = self.model.tag
    else:
        raise ValueError(
            'Language "%s" not supported for POS tagging.\nSupply a 3 letter code from ISO-639.'
            % self._language)
def __init__(self, df, review_col, truth_col, copy=True, analyzer=None,
             stop_words=stopwords.words('english'),
             pos_tag=PerceptronTagger().tag,
             parse=RegexpParser(grammar).parse,
             lemmatize=WordNetLemmatizer().lemmatize):
    # DataFrame stuffs
    self.df = df.copy() if copy else df
    self.review_col = review_col
    self.truth_col = truth_col

    # NLP stuffs
    self.analyzer = self.vader if analyzer is None else analyzer
    self.stop_words = stop_words
    self.pos_tag = pos_tag
    self.parse = parse
    self.lemmatize = lemmatize
def tagger(self):
    """
    Usage:
        training_corpus = list(alp.tagged_sents())
        tagger = PerceptronTagger(load=True)
        tagger.train(training_corpus)
        # sent = 'NLTK is een goeda taal voor het leren over NLP'.split()
        print(tagger.tag(article_text.split()))
    :return:
    """
    # Load corpus
    training_corpus = list(alp.tagged_sents())
    tagger = PerceptronTagger(load=True)
    # Build tagger
    tagger.train(training_corpus)
    return tagger.tag(self.string.split())
def main():
    training_corpus = list(alp.tagged_sents())
    global tagger
    tagger = PerceptronTagger()
    tagger.train(training_corpus)
    num = 2138
    dic = {}
    Xtrain = []
    Ytrain = []
    with open("trainGxG/GxG_News.txt") as txt:
        for line in txt:
            if line[0:8] == "<doc id=":
                Ytrain.append(line.split()[3][8])
                string = [line.split('\"')[1]]
                dic[line.split('\"')[1]] = line.split()[3][8]
            elif line[0:6] == "</doc>":
                Xtrain.append(" ".join(string))
            else:
                string.append(line)
    Xtest = []
    with open("testGxG/GxG_News.txt") as txt:
        for line in txt:
            if line[0:8] == "<doc id=":
                string = []
            elif "</doc>" in line:
                Xtest.append(" ".join(string))
            else:
                string.append(line)
    Ytest = []
    with open("testGxG/GxG_News_gold.txt") as text:
        for line in text:
            Ytest.append(line.split()[1])
    sentences = []
    for i in Xtrain[:num]:
        sentences.append(preprocess(i))
    nlp = spacy.load('nl_core_news_sm')
    veclist = []
    for sentence in sentences:
        doc = nlp(sentence)
        vec = doc.vector
        veclist.append(vec)
    X = np.array(veclist)
    clf = KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=300,
                 tol=0.0001, precompute_distances='auto', verbose=0,
                 random_state=None, copy_x=True, n_jobs=None)
    labels = clf.fit_predict(X)
    pca = PCA(n_components=2).fit(X)
    coords = pca.transform(X)
    lst = []
    for index, sentence in enumerate(sentences):
        plt.text(coords[index].tolist()[0], coords[index].tolist()[1],
                 str(dic[sentence.split()[0]]) + str(labels[index]) + ":" + str(sentence)[0:10],
                 fontsize=4)
        lst.append(str(dic[sentence.split()[0]]) + str(labels[index]))
    label_colors = ["red", "blue", "green", "yellow", "black", "purple", "cyan"]
    colors = [label_colors[i] for i in labels]
    plt.scatter(coords[:, 0], coords[:, 1], c=colors)
    centroids = clf.cluster_centers_
    centroid_coords = pca.transform(centroids)
    plt.scatter(centroid_coords[:, 0], centroid_coords[:, 1], marker="X", s=200,
                linewidth=2, c="#444d61")
    print(Counter(labels))
    genders = []
    for i, j in enumerate(sentences):
        if i < num:
            genders.append(dic[j.split()[0]])
    print(Counter(genders))
    print(Counter(lst))
    plt.show()
def main(file_input):
    data_df = pd.read_csv(str(file_input) + '.csv')
    data_df = shuffle(data_df)
    print("Loaded .csv file Successfully")
    print("Total Number of Samples:", data_df.shape[0])
    print("Total Number of Features:", data_df.shape[1])

    # Missing Values
    # repeatedly drop rows that have NaNs in the column with the most missing values
    def missing_value(data_df):
        while data_df.isnull().sum().values.sum() != 0:
            # idxmax returns the label of the column with the most missing values
            col_with_missing_val = (data_df.isnull().sum()).idxmax()
            data_df = data_df[data_df[col_with_missing_val].notnull()]
            print("Missing Values in Features:", col_with_missing_val)
        return data_df

    # Missing Value Treatment:
    print("Missing Value Treatment : Start")
    data_df = missing_value(data_df)
    print("Missing Value Treatment : Stop")
    print("Total Number of Samples:", data_df.shape[0])
    print("Total Number of Features:", data_df.shape[1])

    # Pattern matchers for the candidate feature
    # Newly added features: date format, currency format, number of digits per candidate,
    # number of separators per candidate
    print("Computing Pattern Transformers: Start")
    pattern_strictlyDigits = "^[0-9]*$"
    pattern_endWithCharacters = "^\d*[\/.,@$!)(]$"  # only digits + ends with special characters
    pattern_telephone = "^0[0-9]{12}$"
    pattern_vat = "^0?[0-9]{9}$"
    pattern_date = '^[0-3]?[0-9](\/|\,|\.|\-){1}[0-9]?[0-9](\/|\,|\.|\-){1}[0-2][0-9]{1,3}$'
    pattern_currency_1 = '^[0-9]\.[0-9]+\,[0-9]*$'  # captures ddddd,dddd
    pattern_currency_2 = '^[0-9]+\,[0-9]+$'

    data_df['currency_filter'] = data_df['candidate'].str.contains(pattern_currency_1, regex=True).astype(np.int64) \
        | data_df['candidate'].str.contains(pattern_currency_2, regex=True).astype(np.int64)
    data_df['dates_filter'] = data_df['candidate'].str.contains(
        pattern_date, regex=True).astype(np.int64)
    data_df["Is_strictly_Digits"] = data_df["candidate"].str.contains(
        pattern_strictlyDigits, regex=True).astype(np.int64)
    data_df["endWithCharacters"] = data_df["candidate"].str.contains(
        pattern_endWithCharacters, regex=True).astype(np.int64)
    data_df["Number_of_Digits"] = data_df['candidate'].apply(
        lambda x: len(re.sub("\W", "", x)))
    data_df["Number_of_Separators"] = data_df['candidate'].apply(
        lambda x: len(re.sub("\w", "", x)))
    data_df["Length_of_Candidate"] = data_df['candidate'].apply(
        lambda x: len(x))
    # includes the country code
    data_df["Telephone"] = data_df["candidate"].str.contains(
        pattern_telephone, regex=True).astype(np.int64)
    # VAT number contains 9 to 10 digits
    data_df["VATNumber"] = data_df["candidate"].str.contains(
        pattern_vat, regex=True).astype(np.int64)

    # drop blacklisted variables
    dates_index = data_df.index[data_df['dates_filter'] == 1].tolist()
    data_df = data_df.drop(index=dates_index, axis=0)
    data_df = data_df.drop("dates_filter", axis=1)
    currency_index = data_df.index[data_df['currency_filter'] == 1].tolist()
    data_df = data_df.drop(index=currency_index, axis=0)
    data_df = data_df.drop(["currency_filter"], axis=1)
    telephone_index = data_df.index[data_df['Telephone'] == 1].tolist()
    data_df = data_df.drop(index=telephone_index, axis=0)
    data_df = data_df.drop(["Telephone"], axis=1)
    vat_index = data_df.index[data_df['VATNumber'] == 1].tolist()
    data_df = data_df.drop(index=vat_index, axis=0)
    data_df = data_df.drop(["VATNumber"], axis=1)
    vat_index = data_df.index[data_df['endWithCharacters'] == 1].tolist()
    data_df = data_df.drop(index=vat_index, axis=0)
    data_df = data_df.drop(["endWithCharacters"], axis=1)
    print("Computing Pattern Transformers: Stop")

    # NLP techniques:
    # tokenization, stemming, lemmatization, frequency distribution, bag-of-words approach
    # Combine the three text columns into a single column - this column contains the full text
    data_df["Text"] = data_df["line_before"] + data_df["line_at"] + data_df["line_after"]

    print("Computing Context Transformers: Start")

    # Context Transformers
    def email_match(doc):
        match = re.search(r'[\w\.-]+@[\w\.-]+', str(doc))
        if match != None:
            return 1
        else:
            return 0

    data_df["Number_of_Characters_Text"] = data_df["Text"].apply(
        lambda x: len(re.sub("[^a-z]", "", str(x))))
    data_df["Number_of_Digits_Text"] = data_df["Text"].apply(
        lambda x: len(re.sub("[^0-9]+", "", str(x))))
    data_df["Number_of_Separators_Text"] = data_df["Text"].apply(
        lambda x: len((re.sub("[\w]+", "", str(x))).replace(" ", "")))
    data_df["Is_Email_Exists"] = data_df["Text"].apply(
        email_match)  # place 1 wherever an email is found, else 0
    data_df["Number_of_spaces"] = data_df["Text"].apply(
        lambda x: str(x).count(' '))  # counts number of spaces

    # Clean data - tokenization, stop word check, size filter, stemming - Dutch language
    ss = SnowballStemmer("dutch", "french")

    def clean_data(doc):
        ignore = list(set(stopwords.words('dutch', 'french')))  # list of stopwords to ignore
        exl_chars = list(set(string.punctuation))
        exl_chars.append('€')
        # remove email ids to avoid conflicts in vocabulary construction
        doc = re.sub("[\w\.-]+@[\w\.-]+", " ", str(doc))
        doc = re.sub("\d", " ", str(doc))
        doc = ''.join([ch for ch in doc if ch not in exl_chars])
        words = []
        for i in word_tokenize(doc):  # tokenization
            if i not in ignore:
                if len(i) >= 2:  # standalone letters do not add any value
                    i = ss.stem(i)
                    words.append(i)
        doc = ' '.join(list(set(words)))
        return doc

    print("Cleaning Text Data: Start")
    data_df["Text"] = data_df["Text"].apply(clean_data)  # tokenize, stem and lemmatize
    print("Cleaning Text Data: Stop")

    print("Computing POS Vectors: Start")
    # training_corpus = alp.tagged_sents()
    alp_tagged_sent = list(alp.tagged_sents())
    tagger = PerceptronTagger(load=False)
    tagger.train(alp_tagged_sent)

    def count_adj(doc):
        tags = tagger.tag(doc.split())
        # drop garbage tokens (three identical leading or trailing characters);
        # iterate over a copy so removal is safe, and test the word (tup[0]), not the tuple
        for tup in list(tags):
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            elif len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_adj_adv = counts['adv'] + counts['adj']
        return count_adj_adv

    def count_nn(doc):
        tags = tagger.tag(doc.split())
        for tup in list(tags):
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            elif len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_nn = counts['noun']
        return count_nn

    def count_verb(doc):
        tags = tagger.tag(doc.split())
        for tup in list(tags):
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            elif len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_verb = counts['verb']
        return count_verb

    data_df["Adv_Adj_Count"] = data_df["Text"].apply(count_adj)
    data_df["NN_count"] = data_df["Text"].apply(count_nn)
    data_df["Verb_count"] = data_df["Text"].apply(count_verb)
    print("Computing POS Vectors: Stop")

    print("Computing Vocabulary: Start")
    # store all the words of the positive class in a list
    docs_pos = []
    docs_pos.extend(
        word_tokenize(words) for words in data_df.Text[data_df.gold == 1])
    docs_pos = list(itertools.chain(*docs_pos))
    # Clean text data - remove words like iiiiiii, hhhhhccchhhh, abvwwwwwcgdccc
    # (iterate over a copy since items are removed from the list)
    for i in list(docs_pos):
        first_3_characters = i[:3]
        last_3_characters = i[-3:]
        if len(i) >= 3 and first_3_characters[0] == first_3_characters[1] == first_3_characters[2]:
            docs_pos.remove(i)
        if i in docs_pos and len(i) >= 3 and last_3_characters[0] == last_3_characters[1] == last_3_characters[2]:
            docs_pos.remove(i)
    print("Positive class words are stored successfully")
    all_words_pos = nltk.FreqDist(docs_pos)

    print("Computing vocabulary based on Positive Class")
    # find popular words; popular means occurring at least 25 times in the corpus
    popular_pos_words = []
    for i in all_words_pos.items():
        if i[1] >= 25:
            popular_pos_words.append(i[0])

    # Filter nouns from the popular positive class words
    tagged_pos_words = tagger.tag(popular_pos_words)
    filtered_tag_pos_words_nouns = []
    for word in tagged_pos_words:
        if word[1] == 'noun':
            filtered_tag_pos_words_nouns.append(word[0])
    vocab_pos = list(set(filtered_tag_pos_words_nouns))
    vocabulary = list(set(vocab_pos))

    # save vocabulary
    with open("vocab.txt", "wb") as fp:
        pickle.dump(vocabulary, fp)
    print("Computing Vocabulary: Stop")
    print("Length of Vocabulary: ", len(vocabulary))

    print("Computing Bag of Words Vectors: Start")

    def build_features(doc):
        vector = np.zeros((1, len(vocabulary)), dtype=np.int64)
        for w in word_tokenize(doc):
            for idx, vocab in enumerate(vocabulary):
                if vocab == w:
                    vector[0][idx] += 1
        return vector

    bag_vectors = data_df["Text"].apply(build_features)
    feature_vectors = np.zeros((data_df.shape[0], len(vocabulary)), dtype=np.int64)
    for pos, index in enumerate(data_df.index.values):
        feature_vectors[pos, :] = bag_vectors[index]
    cols = ["BOW_" + str(col) for col in range(0, len(vocabulary))]
    for col_index, col in enumerate(cols):
        data_df[col] = feature_vectors[:, col_index].reshape(data_df.shape[0], 1)
    print("Computing Bag of Words Vectors: Stop")
    print("Computing Context Transformers: Stop")

    print("Computing Location Transformers: Start")
    data_df["location_page_nr"] = data_df["page_nr"].apply(lambda x: 100 if x >= 50 else x)
    data_df["location_line_nr"] = data_df["line_nr"].apply(lambda x: 100 if x >= 50 else x)
    print("Computing Location Transformers: Stop")
    print("Total Number of Newly Added Features:", data_df.shape[1] - 7)

    print("Building ML - Neural Network Model: Start")
    X = data_df.drop([
        "candidate", "Text", "gold", "label", "line_after", "line_at",
        "line_before", "line_nr", "page_nr"
    ], axis=1)
    y = data_df.gold
    # Normalisation
    X = (X - X.mean(axis=0)) / X.std(axis=0)

    def build_model(input_shape):
        model = Sequential()
        model.add(Dense(1024, input_shape=(input_shape, )))
        model.add(Activation('sigmoid'))
        model.add(Dense(512))
        model.add(Activation('sigmoid'))
        model.add(Dense(128))
        model.add(Activation('sigmoid'))
        model.add(Dense(1, activation="sigmoid"))
        model.compile(optimizer='adam',
                      loss=tf.keras.losses.mean_squared_error,
                      metrics=['accuracy'])
        return model

    # Stratified k-fold
    k_fold_outer = model_selection.StratifiedKFold(n_splits=5)
    scores = []
    split = 0
    for train_index, test_index in k_fold_outer.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]
        model = build_model(X_train.shape[1])
        history = model.fit(X_train, y_train, epochs=5,
                            batch_size=1024, verbose=1)
        results = model.evaluate(X_val, y_val)
        scores.append(results[1])
        split += 1
        del model, history, results

    model = build_model(X.shape[1])
    model.fit(X, y, verbose=0)
    print('Saving the Model *.h5...')
    model.save('model_candidate_filter.h5')

    yHat_proba = model.predict(X)
    yHat = np.copy(yHat_proba)
    yHat[yHat <= 0.5] = 0
    yHat[yHat > 0.5] = 1
    br_score = np.around(metrics.brier_score_loss(y, yHat_proba, pos_label=1), decimals=5)

    print("Storing Results in .csv file")
    confidence = np.zeros((yHat_proba.shape[0], yHat_proba.shape[1]))
    for i in range(0, yHat_proba.shape[0]):
        if yHat_proba[i] <= 0.5:
            confidence[i] = 1 - yHat_proba[i]
        else:
            confidence[i] = yHat_proba[i]
    results_data_frame = pd.DataFrame(
        columns=["Predictions", "Confidence Level"], index=data_df.index)
    results_data_frame["Predictions"] = yHat.astype(np.int64).ravel()
    results_data_frame["Confidence Level"] = np.around(confidence, decimals=4)
    results_data_frame.to_csv("Results_predictions_confidence_train.csv",
                              encoding='utf-8', header=True, index=True)
    return np.mean(scores), br_score
def labelClustersWithKeyPhrases(labels, myReader, num_clusters, n):
    top_features_list = []
    tagger = PerceptronTagger()
    pos_tag = tagger.tag
    grammar = r"""
        NBAR:
            {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
    """
    # Create phrase tree
    chunker = nltk.RegexpParser(grammar)
    stop = ENGLISH_STOP_WORDS
    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()

    # generator, generates leaves one by one
    def leaves(tree):
        """Finds NP (nounphrase) leaf nodes of a chunk tree."""
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'
                                     or t.label() == 'JJ' or t.label() == 'RB'):
            yield subtree.leaves()

    # stemming, lemmatizing, lower case...
    def normalise(word):
        """Normalises words to lowercase and stems and lemmatizes it."""
        word = word.lower()
        word = stemmer.stem(word)
        word = lemmatizer.lemmatize(word)
        return word

    # stop-words and length control
    def acceptable_word(word):
        """Checks conditions for acceptable word: length, stopword."""
        accepted = bool(2 <= len(word) <= 40 and word.lower() not in stop)
        return accepted

    # generator, creates one item at a time
    def get_terms(tree):
        for leaf in leaves(tree):
            term = [normalise(w) for w, t in leaf if acceptable_word(w)]
            # Phrases only
            if len(term) > 1:
                yield term

    def flatten(npTokenList):
        finalList = []
        for phrase in npTokenList:
            token = ''
            for word in phrase:
                token += word + ' '
            finalList.append(token.rstrip())
        return finalList

    for cluster in range(num_clusters):
        # indices of documents in cluster
        indices = [index for index, clusterNum in enumerate(labels) if clusterNum == cluster]
        clusterCorpus = [doc_dict['negative_feedback']
                         for (docnum, doc_dict) in myReader.iter_docs()
                         if docnum in indices]
        # join into a single string so re.findall below receives text, not a list
        clusterCorpus = ' '.join(clusterCorpus)
        counter = Counter()
        counter.update(flatten([word for word in get_terms(
            chunker.parse(pos_tag(re.findall(r'\w+', clusterCorpus))))]))
        most_common_n = counter.most_common(n)
        top_features = [feature[0] for feature in most_common_n]
        top_features_list.append(top_features)

    feature_names_df = pd.DataFrame(top_features_list, columns=['1', '2', '3', '4', '5'])
    return feature_names_df
def run_test(my_corpus):
    if my_corpus == treebank:
        print('Corpus Info:')
        print('  Corpus: treebank')
        print('  Tagged Sents:', len(my_corpus.tagged_sents()))
        print('  Tagged Words:', len(my_corpus.tagged_words()))
        my_tagged_sents = my_corpus.tagged_sents()
        my_sents = my_corpus.sents()
    elif my_corpus == brown:
        print('Corpus Info:')
        print('  Corpus: brown')
        print('  Tagged Sents:', len(my_corpus.tagged_sents()))
        print('  Tagged Words:', len(my_corpus.tagged_words()))
        print('  Tagged Sents (news):', len(my_corpus.tagged_sents(categories='news')))
        print('  Tagged Words (news):', len(my_corpus.tagged_words(categories='news')))
        my_tagged_sents = my_corpus.tagged_sents(categories='news')
        my_sents = my_corpus.sents(categories='news')
        # print('  Tagged Sents :', len(my_corpus.tagged_sents()))
        # print('  Tagged Words :', len(my_corpus.tagged_words()))
        # my_tagged_sents = my_corpus.tagged_sents()
        # my_sents = my_corpus.sents()
    else:
        return

    fold = 5
    print('Performing', fold, 'fold cross validation on corpus ...')
    train_accuracy = []
    test_accuracy = []
    train_runtime = []
    test_runtime = []
    for k in range(fold):
        train_data = [x for i, x in enumerate(my_tagged_sents) if i % fold != k]
        validation_data = [x for i, x in enumerate(my_tagged_sents) if i % fold == k]
        # test_data = [x for i, x in enumerate(my_sents) if i % fold == k]
        print('Fold', k, ' has', len(train_data), 'train sentences and',
              len(validation_data), 'test sentences')

        perceptron_pos_tagger = PerceptronTagger(load=False)
        begin = time.time()
        perceptron_pos_tagger.train(train_data)
        end = time.time()
        train_acc = perceptron_pos_tagger.evaluate(train_data)
        train_accuracy.append(train_acc)
        train_runtime.append(end - begin)
        print('  Train accuracy =', train_acc, ' runtime =', end - begin)

        begin = time.time()
        test_acc = perceptron_pos_tagger.evaluate(validation_data)
        end = time.time()
        test_accuracy.append(test_acc)
        test_runtime.append(end - begin)
        print('  Test accuracy =', test_acc, ' runtime =', end - begin)

    print('Results:')
    print('%15s %15s %15s %15s %15s' % ('Fold', 'Train-Accuracy', 'Train-Runtime',
                                        'Test-Accuracy', 'Test-Runtime'))
    for k in range(fold):
        print('%15d %15.3f%% %15.5f %15.3f%% %15.5f' % (
            k, train_accuracy[k] * 100, train_runtime[k],
            test_accuracy[k] * 100, test_runtime[k]))
    avg_train_acc = sum(train_accuracy) / len(train_accuracy)
    avg_train_runtime = sum(train_runtime) / len(train_runtime)
    avg_test_acc = sum(test_accuracy) / len(test_accuracy)
    avg_test_runtime = sum(test_runtime) / len(test_runtime)
    print('%15s %15.3f%% %15.5f %15.3f%% %15.5f' % (
        'Average', avg_train_acc * 100, avg_train_runtime,
        avg_test_acc * 100, avg_test_runtime))
    return
def main(file_input):
    test_data = pd.read_csv(str(file_input) + '.csv')
    # test_data = pd.read_csv(str(file_input) + '.csv', index_col='Unnamed: 0')
    print("Loaded .csv file Successfully")

    print("Missing Value Treatment : Start")
    # missing values treatment
    while test_data.isnull().sum().values.sum() != 0:
        # idxmax returns the label of the column with the most missing values
        col_with_missing_val = (test_data.isnull().sum()).idxmax()
        # drop corresponding rows that have NaN values
        test_data = test_data[test_data[col_with_missing_val].notnull()]
        print(col_with_missing_val)
    print("Missing Value Treatment : Stop")
    print("Total Number of Samples:", test_data.shape[0])
    print("Total Number of Features:", test_data.shape[1])

    print("Computing Pattern Transformers: Start")
    # pattern transformers
    pattern_strictlyDigits = "^[0-9]*$"
    test_data["strictly_Digits"] = test_data["candidate"].str.contains(
        pattern_strictlyDigits, regex=True).astype(np.int64)
    test_data["Number_of_Digits"] = test_data['candidate'].apply(
        lambda x: len(re.sub("\W", "", x)))
    test_data["Number_of_Separators"] = test_data['candidate'].apply(
        lambda x: len(re.sub("\w", "", x)))
    test_data["Length_of_Candidate"] = test_data['candidate'].apply(
        lambda x: len(x))
    print("Computing Pattern Transformers: Stop")

    print("Computing Context Transformers: Start")
    # context transformers
    test_data["Text"] = test_data["line_before"] + test_data["line_at"] + test_data["line_after"]

    def email_match(doc):
        match = re.search(r'[\w\.-]+@[\w\.-]+', str(doc))
        if match != None:
            return 1
        else:
            return 0

    test_data["Number_of_Characters_Text"] = test_data["Text"].apply(
        lambda x: len(re.sub("[^a-z]", "", str(x))))
    test_data["Number_of_Digits_Text"] = test_data["Text"].apply(
        lambda x: len(re.sub("[^0-9]+", "", str(x))))
    test_data["Number_of_Separators_Text"] = test_data["Text"].apply(
        lambda x: len((re.sub("[\w]+", "", str(x))).replace(" ", "")))
    test_data["Email_Exists"] = test_data["Text"].apply(
        email_match)  # place 1 wherever an email is found, else 0
    test_data["Number_of_spaces"] = test_data["Text"].apply(
        lambda x: str(x).count(' '))  # counts number of spaces

    # Clean data - tokenization, stop word check, size filter, stemming - Dutch language
    ss = SnowballStemmer("dutch", "french")

    def clean_data(doc):
        ignore = list(set(stopwords.words('dutch', 'french')))  # list of stopwords to ignore
        exl_chars = list(set(string.punctuation))
        exl_chars.append('€')
        # remove email ids to avoid conflicts in vocabulary construction
        doc = re.sub("[\w\.-]+@[\w\.-]+", " ", str(doc))
        doc = re.sub("\d", " ", str(doc))
        doc = ''.join([ch for ch in doc if ch not in exl_chars])
        words = []
        for i in word_tokenize(doc):  # tokenization
            if i not in ignore:
                if len(i) >= 2:  # standalone letters do not add any value
                    i = ss.stem(i)
                    words.append(i)
        doc = ' '.join(list(set(words)))
        return doc

    test_data["Text"] = test_data["Text"].apply(clean_data)  # tokenize, stem and lemmatize

    # training_corpus = alp.tagged_sents()
    alp_tagged_sent = list(alp.tagged_sents())
    tagger = PerceptronTagger(load=False)
    tagger.train(alp_tagged_sent)

    def count_adj(doc):
        tags = tagger.tag(doc.split())
        # drop garbage tokens (three identical leading or trailing characters);
        # iterate over a copy so removal is safe, and test the word (tup[0]), not the tuple
        for tup in list(tags):
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            elif len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_adj_adv = counts['adv'] + counts['adj']
        return count_adj_adv

    def count_nn(doc):
        tags = tagger.tag(doc.split())
        for tup in list(tags):
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            elif len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_nn = counts['noun']
        return count_nn

    def count_verb(doc):
        tags = tagger.tag(doc.split())
        for tup in list(tags):
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            elif len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_verb = counts['verb']
        return count_verb

    test_data["Adv_Adj_Count"] = test_data["Text"].apply(count_adj)
    test_data["NN_count"] = test_data["Text"].apply(count_nn)
    test_data["Verb_count"] = test_data["Text"].apply(count_verb)
    print("Computing Context Transformers: Stop")

    # load the vocabulary
    with open("vocab.txt", "rb") as fp:
        vocabulary = pickle.load(fp)

    print("Computing Bag of Words Vectors: Start")

    def build_features(doc):
        vector = np.zeros((1, len(vocabulary)), dtype=np.int64)
        for w in word_tokenize(doc):
            for i, word in enumerate(vocabulary):
                if word == w:
                    vector[0][i] += 1
        return vector

    bag_vectors = test_data["Text"].apply(build_features)
    feature_vectors = np.zeros((test_data.shape[0], len(vocabulary)), dtype=np.int64)
    for pos, index in enumerate(test_data.index.values):
        feature_vectors[pos, :] = bag_vectors[index]
    cols = ["BOW_" + str(col) for col in range(0, len(vocabulary))]
    for col_index, col in enumerate(cols):
        test_data[col] = feature_vectors[:, col_index].reshape(test_data.shape[0], 1)
    print("Computing Bag of Words Vectors: Stop")

    print("Computing Location Transformers: Start")
    test_data["location_page_nr"] = test_data["page_nr"].apply(
        lambda x: 100 if x >= 50 else x)
    test_data["location_line_nr"] = test_data["line_nr"].apply(
        lambda x: 100 if x >= 50 else x)
    print("Computing Location Transformers: Stop")

    print("Loading Model...")
    model = tf.keras.models.load_model('model_candidate_filter.h5')
    model.compile(loss=tf.keras.losses.mean_squared_error,
                  optimizer='adam',
                  metrics=['accuracy'])
    print("Loaded Model Successfully!")

    X_test = test_data.drop([
        "candidate", "Text", "label", "line_after", "line_at", "line_before",
        "page_nr", "line_nr"
    ], axis=1)
    X_test = (X_test - X_test.mean(axis=0)) / X_test.std(axis=0)

    yHat_proba = model.predict(X_test)
    yHat = np.copy(yHat_proba)
    yHat[yHat <= 0.5] = 0
    yHat[yHat > 0.5] = 1

    print("Storing Results in .csv file")
    confidence = np.zeros((yHat_proba.shape[0], yHat_proba.shape[1]))
    for i in range(0, yHat_proba.shape[0]):
        if yHat_proba[i] <= 0.5:
            confidence[i] = 1 - yHat_proba[i]
        else:
            confidence[i] = yHat_proba[i]
    results_data_frame = pd.DataFrame(
        columns=["Predictions", "Confidence Level"], index=test_data.index)
    results_data_frame["Predictions"] = yHat.astype(np.int64).ravel()
    results_data_frame["Confidence Level"] = np.around(confidence, decimals=4)
    results_data_frame.to_csv("Results_predictions_confidence_run.csv",
                              encoding='utf-8', header=True, index=True)
def __init__(self):
    nltk.download('averaged_perceptron_tagger')
    self.tagger = PerceptronTagger()
    self.lemmatizer = WordNetLemmatizer()
    self.stopwords = list(stopwords.words('english'))
    self.auto_correct_remaining = 0
def count_verbs(self, s0):
    tagger = PerceptronTagger()
    s0_tags = tagger.tag(s0)
    # counts only tokens tagged VBP (verb, non-3rd person singular present)
    V_s0 = [values[0] for values in s0_tags if values[1] == 'VBP']
    return len(V_s0)
def count_nouns(self, s0):
    tagger = PerceptronTagger()
    s0_tags = tagger.tag(s0)
    # counts only tokens tagged NN (singular or mass noun)
    NN_s0 = [values[0] for values in s0_tags if values[1] == 'NN']
    return len(NN_s0)
import numpy as np
import torch
from torch.autograd import Variable
import pickle
from collections import Counter
from torch import nn
import torch.nn.functional as F
from nltk.tag import PerceptronTagger
from nltk.corpus import alpino as alp
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import PunktSentenceTokenizer

training_corpus = list(alp.tagged_sents())
tagger = PerceptronTagger(load=True)
tagger.train(training_corpus)
wordTokenizer = WordPunctTokenizer()
sentTokenizer = PunktSentenceTokenizer()


def generate_vocabulary(data, vocabulary_size):
    all_data = " ".join(data)
    print(all_data[:100])
    words = [
        word for sent in sentTokenizer.tokenize(all_data)
        for word in wordTokenizer.tokenize(sent)
    ]
    counter = Counter(words)
    # most_common() produces the k most frequently encountered
    # input values and their respective counts.
    most_common = counter.most_common(vocabulary_size)
    vocabulary = set([word for word, count in most_common])
    return vocabulary  # assumed completion; the original snippet ends without a return
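# A hedged usage sketch (assumption, not in the original module): build a small vocabulary
# from two sample Dutch strings. It relies on the `return vocabulary` completion noted above.
sample_docs = ["De kat zit op de mat.", "De hond ligt in de tuin."]
print(generate_vocabulary(sample_docs, vocabulary_size=10))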
def __init__(self):
    super(CountAdjectives, self).__init__()
    self.tagger = PerceptronTagger(load=True)
    training_corpus = list(alpino.tagged_sents())
    self.tagger.train(training_corpus)
def get_keyphrases(self, textInput, min_freq=2):
    # setting up the tagger
    # (from http://stackoverflow.com/a/35964709)
    PICKLE = "averaged_perceptron_tagger.pickle"
    AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
    tagger = PerceptronTagger(load=False)
    tagger.load(AP_MODEL_LOC)
    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()

    # This grammar is described in the paper by S. N. Kim, T. Baldwin, and M.-Y. Kan.
    # Evaluating n-gram based evaluation metrics for automatic keyphrase extraction.
    # Technical report, University of Melbourne, Melbourne 2010.
    StopWords = stopwords.words('english')

    def leaves(tree):
        """Finds NP (nounphrase) leaf nodes of a chunk tree."""
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
            yield subtree.leaves()

    def acceptable_word(word):
        """Checks conditions for acceptable word: length, stopword."""
        accepted = bool(2 < len(word) and word.lower() not in StopWords)
        return accepted

    def normalise(word):
        """Normalises words to lowercase and stems and lemmatizes it."""
        word = word.lower()
        word = stemmer.stem(word)
        word = lemmatizer.lemmatize(word)
        return word

    def get_terms(tree):
        for leaf in leaves(tree):
            # can replace normalise with w.lower() if you don't want to normalize the words
            term = [normalise(w) for w, t in leaf if acceptable_word(w)]
            yield term

    def get_nounPhrases(textInput, minWordLength=2):
        grammar = r"""
            NBAR:
                {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

            NP:
                {<NBAR>}
                {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
        """
        chunker = nltk.RegexpParser(grammar)
        toks = nltk.word_tokenize(textInput)
        # print(toks)
        pos_tag = tagger.tag
        postoks = pos_tag(toks)
        tree = chunker.parse(postoks)
        terms = get_terms(tree)
        nounPhraseList = []
        for tid, term in enumerate(terms):
            templist = []
            for wid, word in enumerate(term):
                # print("TID: ", tid, " WID: ", (wid + 1), word)
                templist.append(word)
            s = " "
            nounPhraseList.append(s.join(templist))
        nounPhraseList = [word for word in nounPhraseList if len(word.split()) >= minWordLength]
        return nounPhraseList

    counter = Counter()
    for nounPhrase in get_nounPhrases(textInput):
        # print(nounPhrase)
        counter.update([nounPhrase])

    keyphraseDF = pandas.DataFrame(
        [[key, value] for key, value in counter.items() if value >= min_freq],
        columns=['keyphrase_stemmed', 'frequency'])
    (docsDF, occurrenceDF) = self.get_occurrence(keyphraseDF)
    print("docs", docsDF)
    print("keys", keyphraseDF)
    keyphraseDF = keyphraseDF.join(docsDF["docs"])
    print(occurrenceDF)
    keyphraseDF = keyphraseDF.join(self.get_fullphrases(keyphraseDF=keyphraseDF)["keyphrase_full"])
    keyphraseDF = keyphraseDF.join(self.get_MIs(occurrenceDF=occurrenceDF)["MI"])
    keyphraseDF = keyphraseDF.join(
        self.get_PMIs(occurrenceDF=occurrenceDF, metric="sentiment_class", value="positive")["PMI_pos"])
    keyphraseDF = keyphraseDF.join(
        self.get_PMIs(occurrenceDF=occurrenceDF, metric="sentiment_class", value="negative")["PMI_neg"])
    # keyphraseDF = keyphraseDF.join(self.get_PMIs(keyphraseDF["Keyphrase_stemmed"].tolist(), "neg"))
    return keyphraseDF
from nltk.tag import PerceptronTagger
from nltk.data import find
import glob

# code for loading the perceptron tagger
PICKLE = "averaged_perceptron_tagger.pickle"
AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
tagger = PerceptronTagger(load=False)
tagger.load(AP_MODEL_LOC)
pos_tag = tagger.tag

# lists to store the POS and NP lists generated from each file
GlobalPOSList = []
GlobalNPList = []

# getting filenames of dataset files
fileList = glob.glob("C:/Users/Vinod Chhapariya/Desktop/TDBMS/Benchmark Dataset/*.txt")

# printing filenames
for filename in fileList:
    print(filename)

# POS tagging using the Perceptron tagger
for filename in fileList:
    POSList = []
    NPList = []
    filePOSTagWrite = open(filename + "_POSTag_Perceptron", 'w')
    for line in open(filename, 'r').readlines():
        tags = pos_tag(line.split())
# sentence = sentence.rstrip()
doc = nlp(sentence)
for token in doc:
    dependency = [token.text, token.dep_, token.shape_, token.is_alpha,
                  token.is_stop, [child for child in token.children]]
    if dependency[0] == "\n":
        whole_sen.append(parsed)
        parsed = []
    else:
        parsed.append(dependency)

# Frisian sentences tagged with the perceptron tagger; the list is referred to as `fr` below
fr = []
tagger = PerceptronTagger()
with open('frysian_data.txt', 'r', encoding="utf-8") as fr_infile:
    for sentence in fr_infile:
        sentence = word_tokenize(sentence)
        pos = tagger.tag(sentence)
        fr.append(pos)

other = []
final = []
fr_longer = []
for k in range(len(fr)):
    fries = fr[k]
    parsed = whole_sen[k]
    if len(fries) == len(parsed):
        for words, fr_words in zip(parsed, fries):
            print(words[0])
class text_clean:
    def __init__(self):
        pass

    def punctuation_trimming(self, sent):
        y = [x for x in sent if x not in string.punctuation]
        return y

    def special_char_removal(self, tok):
        z = [re.sub('[^A-Za-z0-9]+', '', token) for token in tok]
        z = [x for x in z if x]
        return z

    # Remove stop words
    stop_words = set(stopwords.words('english'))

    def stopw_rem(self, tok):
        clean_tokens = tok[:]
        for token in tok:
            if token in self.stop_words:
                clean_tokens.remove(token)
        return clean_tokens

    # Convert to lower case
    def conv_to_lower(self, sent):
        newtok = [item.lower() for item in sent]
        return newtok

    # POS tagger
    PICKLE = "averaged_perceptron_tagger.pickle"
    AP_MODEL_LOC = 'file:' + str(
        find('taggers/averaged_perceptron_tagger/' + PICKLE))
    tagger = PerceptronTagger(load=False)
    tagger.load(AP_MODEL_LOC)
    pos_tag = tagger.tag

    # Extract nouns
    def noun_iden(self, sent):
        tok = word_tokenize(sent)
        nountok = [
            word for (word, pos) in self.pos_tag(tok) if pos[:2] == 'NN'
        ]
        return nountok

    # Identify the POS and lemmatize accordingly, using the pos parameter of the lemmatizer
    ### Lemmatization
    lemmatizer = WordNetLemmatizer()

    def lemm(self, sent):
        tok = word_tokenize(sent)
        tok2 = []
        for word, tag in self.pos_tag(tok):
            if tag.startswith("NN"):
                temp = self.lemmatizer.lemmatize(word, pos='n')
            elif tag.startswith('VB'):
                temp = self.lemmatizer.lemmatize(word, pos='v')
            elif tag.startswith('JJ'):
                temp = self.lemmatizer.lemmatize(word, pos='a')
            else:
                temp = word
            tok2.append(temp)
        return ' '.join(tok2)

    ### Stemming
    ps = PorterStemmer()

    def stem(self, sent):
        newtok = [self.ps.stem(w) for w in word_tokenize(sent)]
        return ' '.join(newtok)
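# A hedged usage sketch for the text_clean class above (the sample sentences are made up,
# not from the original code); it assumes the NLTK data used by the class (stopwords,
# the averaged perceptron tagger pickle, and WordNet) is already downloaded.
tc = text_clean()
print(tc.noun_iden("The quick brown fox jumps over the lazy dog"))
print(tc.lemm("The cats were running across the fields"))
print(tc.stem("The cats were running across the fields"))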
from collections import Counter

# count_good_raw = Counter(good_raw)
count_good_actors = Counter(good_actors)
count_good_actions = Counter(good_actions)

# number of statements
nos = len(tokenized_actions)
# number of good actors
noga = len(count_good_actors)
# number of good actions
nogc = len(count_good_actions)

PICKLE = "taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle"
import nltk.data
from nltk.tag import PerceptronTagger

_nltk_pos_tagger = PerceptronTagger(load=False)
_nltk_pos_tagger.load(PICKLE)

print(count_good_actors)

S = np.zeros(shape=(nos, noga + nogc))
i = 0
for sent_pos in tokenized_actors:
    for token1 in sent_pos:
        j = 0
        tt1 = _nltk_pos_tagger.tag([token1])
        for feature in count_good_actors:
            ft = _nltk_pos_tagger.tag([feature])
            simval = word_sim(tt1[0], ft[0], i)
            S[i][j] = S[i][j] + simval
            j = j + 1
    i = i + 1