Example #1
    def common_description(self, s0, s1):
        tagger = PerceptronTagger()
        s0_tags = tagger.tag(s0)
        s1_tags = tagger.tag(s1)

        total_dist = 0
        for word, tag in s0_tags:
            if tag.startswith('N') or tag.startswith('V') or tag.startswith('J') or tag.startswith('R'):
                max_dist = 0
                for synset in wn.synsets(word, self.penn_to_wn(tag)):
                    desc = word_tokenize(synset.definition())
                    dist = len(list(set(s1) & set(desc)))
                    if dist > max_dist:
                        max_dist = dist
                total_dist += max_dist

        for word, tag in s1_tags:
            if tag.startswith('N') or tag.startswith('V') or tag.startswith('J') or tag.startswith('R'):
                max_dist = 0
                for synset in wn.synsets(word, self.penn_to_wn(tag)):
                    desc = word_tokenize(synset.definition())
                    dist = len(list(set(s0) & set(desc)))
                    if dist > max_dist:
                        max_dist = dist
                total_dist += max_dist

        return total_dist
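
The helper above calls self.penn_to_wn, which is not shown in this example. A minimal sketch of such a mapping, assuming the standard NLTK WordNet POS constants (the name and module-level placement are illustrative, not the original code):

from nltk.corpus import wordnet as wn

def penn_to_wn(tag):
    # map a Penn Treebank tag prefix to a WordNet POS constant, or None
    if tag.startswith('N'):
        return wn.NOUN
    if tag.startswith('V'):
        return wn.VERB
    if tag.startswith('J'):
        return wn.ADJ
    if tag.startswith('R'):
        return wn.ADV
    return None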
Example #2
class CountAdjectives(TransformerMixin):
    """ adds postags, learns weights """
    def __init__(self):
        super(CountAdjectives, self).__init__()
        self.tagger = PerceptronTagger(load=True)
        training_corpus = list(alpino.tagged_sents())
        self.tagger.train(training_corpus)

    def postag(self, x):
        postagged = self.tagger.tag(x.split())
        onlytags = [tt[1] for tt in postagged]
        return onlytags

    def count_adjectives(self, x):
        postagged = self.postag(x)
        totalcount = len(postagged)
        adjlength = postagged.count('adj')
        if adjlength > 0:
            return adjlength / totalcount
        return 0

    def transform(self, X, y=None):
        new_X = [[self.count_adjectives(x)] for x in X]
        return new_X

    def fit(self, X, y=None):
        return self
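
A brief usage sketch for the transformer above (not part of the original snippet; texts is a hypothetical list of Dutch strings, and the constructor trains the tagger on the Alpino corpus, which can take a while):

texts = ["een mooi klein huis", "de hond slaapt"]
counter = CountAdjectives()
features = counter.fit(texts).transform(texts)
# one adjective-ratio feature per document
print(features)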
Example #3
 def count_common_propper_nouns(self, s0, s1):
     tagger = PerceptronTagger()
     s0_tags = tagger.tag(s0)
     s1_tags = tagger.tag(s1)
     NNP_s0 = [values[0] for values in s0_tags if values[1] == 'NNP']
     NNP_s1 = [values[0] for values in s1_tags if values[1] == 'NNP']
     return len(set(NNP_s0) & set(NNP_s1))
def smaller_subtree_containing_the_drugs(sentence, target_drugs):
    tree_string = nlp.annotate(sentence,
                               properties={
                                   'annotators': 'parse',
                                   'outputFormat': 'json'
                               })
    tagger = PerceptronTagger()
    best_subtree = None
    size = 9999999
    target_drugs = [dr for drug in target_drugs for dr in drug.split(' ')]
    for s in tree_string['sentences']:
        tree_parsed = Tree.fromstring(s['parse'])
        for subtree in tree_parsed.subtrees():
            #         print(subtree.pretty_print())
            leafs = subtree.leaves()
            current_size = len(leafs)
            if all_drugs_in_tree(target_drugs, leafs):
                if current_size < size:
                    best_subtree = subtree
                    size = current_size
        #                 print(subtree.leaves())

    try:
        clean = clean_sentence(best_subtree.leaves())
    except:
        clean = clean_sentence(sentence.split())
    # print('clean',clean)
    tagged = tagger.tag(clean)
    # print('tag:', tagged)
    lemmatized = preprocessor_lemmatize(tagged)
    # print('lemmatized', lemmatized)
    new_sentence = ' '.join([l for l, t in lemmatized])

    return new_sentence
def namedEntityRecognizer():
    echo2("Performing NER on incoming stream")
    content = request.stream.read()
    #print content

    if Verbose:
        echo2("Incoming content is "+content)
    PICKLE = "averaged_perceptron_tagger.pickle"
    AP_MODEL_LOC = 'file:'+str(find('taggers/averaged_perceptron_tagger/'+PICKLE))
    tagger = PerceptronTagger(load=False)
    tagger.load(AP_MODEL_LOC)
    pos_tag = tagger.tag
    start = time.time()
    #date_time = timex.tag(content)
    tokenized = nltk.word_tokenize(content)
    tagged = pos_tag(tokenized)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    names = extract_entity_names(namedEnt, 'NE')
    #names.extend(date_time)
    result = {"result" : "success", "names" : names}
    if Units:
        grammar = '''unit: {<CD><NNS>?<NN.*>?},
                     unit: {<CD><JJ>?<NN.*>}
                  '''
        parser = nltk.RegexpParser(grammar)
        units = extract_entity_names(parser.parse(tagged),'unit')
        result['units'] = units
    jsonDoc = json.dumps(result, sort_keys=True, indent=4, separators=(',', ': '))
    end = time.time()
    print "NER took "+str(end - start)+" seconds"
    return jsonDoc
Example #6
 def train_corpus_to_tag():
     """
     Train tagger on Alpino Corpus
     :return: model tagger  <type: 'model'>
     """
     alp_tagged_sent = list(alp.tagged_sents())
     tagger = PerceptronTagger(load=False)
     tagger.train(alp_tagged_sent)
     return tagger
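
A usage sketch for the Alpino-trained tagger above, treating it as a free function here; the Dutch sample sentence is only illustrative:

tagger = train_corpus_to_tag()
print(tagger.tag('de kat zit op de mat'.split()))
# yields (word, tag) pairs with Alpino-style tags such as 'det', 'noun', 'verb'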
    def test_perceptron_tagger(self):
        tagger = PerceptronTagger(load=False)
        tagger.train(self.corpus)

        encoded = self.encoder.encode(tagger)
        decoded = self.decoder.decode(encoded)

        self.assertEqual(tagger.model.weights, decoded.model.weights)
        self.assertEqual(tagger.tagdict, decoded.tagdict)
        self.assertEqual(tagger.classes, decoded.classes)
class NLTKTagger:
    '''
    class that supplies part of speech tags using NLTK
    note: avoids the NLTK downloader (see __init__ method)
    '''
    def __init__(self):
        import nltk
        from nltk.tag import PerceptronTagger
        from nltk.tokenize import TreebankWordTokenizer
        #return pkgutil.get_data('scattertext',
        #                        'data/viz/semiotic_new.html').decode('utf-8')
        path = os.path.dirname(sys.modules['scattertext'].__file__) + '/data/'
        tokenizer_fn = path + 'punkt.english.pickle'
        tagger_fn = path + 'averaged_perceptron_tagger.pickle'
        #tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
        #tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
        # Load the tagger
        self.tagger = PerceptronTagger(load=False)
        self.tagger.load(tagger_fn)

        # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
        #       Calling the TreebankWordTokenizer like this allows skipping the downloader.
        #       It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
        #       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
        self.tokenize = TreebankWordTokenizer().tokenize
        self.sent_detector = nltk.data.load(tokenizer_fn)

    # http://www.nltk.org/book/ch05.html
    def tag_text(self, text):
        '''take input text and return tokens w/ part of speech tags using NLTK'''
        # putting import here instead of top of file b.c. not all will have nltk installed

        sents = self.sent_detector.tokenize(
            text
        )  # TODO: this will fail on some unicode chars. I think assumes ascii
        word_pos_pairs = []

        all_tokens = []
        for sent in sents:
            tokens = self.tokenize(sent)
            all_tokens = all_tokens + tokens
            word_pos_pairs = word_pos_pairs + self.tagger.tag(tokens)
        return {
            'tokens': all_tokens,
            'pos': [tag for (w, tag) in word_pos_pairs]
        }

    def tag_tokens(self, tokens):
        word_pos_pairs = self.tagger.tag(tokens)
        return {'tokens': tokens, 'pos': [tag for (w, tag) in word_pos_pairs]}
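
A usage sketch for the NLTKTagger class above, assuming the scattertext data files (punkt.english.pickle and averaged_perceptron_tagger.pickle) are present in the installed package:

nltk_tagger = NLTKTagger()
result = nltk_tagger.tag_text("The tagger ships with the package. No download is needed.")
print(result['tokens'])
print(result['pos'])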
    def syllable_pos_setup(self):
        """Sets up syllables and POS tagging"""
        en_list = ['en_CA', 'en_PH', 'en_NA', 'en_NZ', 'en_JM', 'en_BS', 'en_US',
                   'en_IE', 'en_MW', 'en_IN', 'en_BZ', 'en_TT', 'en_ZA', 'en_AU',
                   'en_GH', 'en_ZW', 'en_GB']

        for lang in en_list:
            if not dictools.is_installed(lang): dictools.install(lang)

        self.cmu_dict = cmudict.dict()

        # sets up POS
        try:
            nltk.pos_tag(['test'])
            self.pos_tag = nltk.pos_tag
        except urllib2.URLError:
            PICKLE = "averaged_perceptron_tagger.pickle"
            AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
            tagger = PerceptronTagger(load=False)
            tagger.load(AP_MODEL_LOC)
            self.pos_tag = tagger.tag

        self.tag_dict = {'NN': 'Noun', 'FW': 'Noun', 'JJ': 'Adjective', 'VB': 'Verb',
                         'IN': 'Preposition', 'CC': 'Conjunction',
                         'RP': 'Connector', 'TO': 'Connector', 'MD': 'Connector',
                         'RB': 'Adverb', 'WR': 'Wh-adverb',
                         'DT': 'DetPro', 'WD': 'DetPro', 'PD': 'DetPro', 'PR': 'DetPro', 'WP': 'DetPro',
                         'CD': 'Cardinal',
                         'EX': 'Existential there'}

        ##        self.tag_dict={'NN':'Noun', 'JJ':'Adjective','RB':'Adverb','VB':'Verb',
        ##          'IN':'Preposition','PR':'Pronoun','CC':'Conjunction',
        ##          'RP':'Particle','WR':'Wh-adverb','DT':'Determiner',
        ##          'TO':'To','MD':'Modal Aux','CD':'Cardinal', 'PD':'Predeterminer',
        ##          'WD':'Wh-determiner', 'WP':'Wh-pronoun','EX':'Existential there'}

        # POS which are allowed to happen twice in a row
        self.pos_double = []  # ['Noun','Adjective']

        # POS which can only occur sequentially
        # i.e. an Adverb must occur in front of a verb
        self.pos_lead = {'Adverb': ['Verb'], 'Pronoun': ['Noun'], 'Adjective': ['Noun'],
                         'Preposition': ['Noun', 'Pronoun']}

        # POS which cannot occur sequentially
        # i.e. a preposition cannot come before a verb
        self.pos_restrict_lead = {'Preposition': 'Verb',}

        return
Example #10
    def __init__(self):
        import nltk
        from nltk.tag import PerceptronTagger
        from nltk.tokenize import TreebankWordTokenizer
        tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
        tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))
        # Load the tagger
        self.tagger = PerceptronTagger(load=False)
        self.tagger.load(tagger_fn)

        # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
        #       Calling the TreebankWordTokenizer like this allows skipping the downloader.
        #       It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
        #       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
        self.tokenize = TreebankWordTokenizer().tokenize
        self.sent_detector = nltk.data.load(tokenizer_fn)
def extract_tokens(row, lemmatize=True, use_tag=True):
    tokenizer = WhitespaceTokenizer()
    if lemmatize:  # reduce words to lemmas
        pattern = '[().*+,?!\'\";:]*'
        token_list = list()
        if use_tag:  # use POS tags to obtain more accurate lemmas
            pos_tags = PerceptronTagger().tag(tokenizer.tokenize(row['text']))
            lemmatizer_input = map(
                lambda x: (x[0], nltk_to_wordnet.get(x[1][0])), pos_tags)
            lemmatizer = WordNetLemmatizer()
            for word, tag in lemmatizer_input:
                if word != 'urlLink' and 'http:' not in word:
                    word = word.lower()
                    if tag is None:
                        tok = lemmatizer.lemmatize(word)
                        tok = re.sub(pattern, '', tok)
                        if not tok.isdigit():
                            token_list.append(tok)
                    else:
                        tok = lemmatizer.lemmatize(word, tag)
                        tok = re.sub(pattern, '', tok)
                        if not tok.isdigit():
                            token_list.append(tok)
        else:  # do not use a tagger if not specified and speed up computation
            lemmatizer_input = tokenizer.tokenize(row['text'])
            lemmatizer = WordNetLemmatizer()
            for word in lemmatizer_input:
                if word != 'urlLink' and 'http:' not in word:
                    tok = lemmatizer.lemmatize(word.lower())
                    tok = re.sub(pattern, '', tok)
                    if not tok.isdigit():
                        token_list.append(tok)
    else:  # simply tokenize based on whitespaces
        token_list = tokenizer.tokenize(row['text'])
    return token_list
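
extract_tokens relies on a module-level nltk_to_wordnet dict that maps the first character of a Penn Treebank tag to a WordNet POS constant (it is looked up with x[1][0] above). A plausible definition, not shown in the snippet, would be:

from nltk.corpus import wordnet

nltk_to_wordnet = {
    'N': wordnet.NOUN,
    'V': wordnet.VERB,
    'J': wordnet.ADJ,
    'R': wordnet.ADV,
}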
    def CorpusListPhrase(self, matrix, stopwords):

        phrase_list = []
        grammar = r"""
            NBAR:
                {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
                
            NP:
                {<NBAR>}
                {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
        """

        tagger = PerceptronTagger()
        pos_tag = tagger.tag
        # Create phrase tree
        chunker = nltk.RegexpParser(grammar)
        for doc in matrix:
            phrase = self._flatten([
                word for word in self._getTerms(
                    chunker.parse(pos_tag(re.findall(r'\w+', str(doc)))))
                if word not in stopwords
            ])
            phrase_list.append(",".join(phrase))

        return phrase_list
class NLTKTagger:
	'''
	class that supplies part of speech tags using NLTK
	note: avoids the NLTK downloader (see __init__ method)
	'''

	def __init__(self):
		import nltk
		from nltk.tag import PerceptronTagger
		from nltk.tokenize import TreebankWordTokenizer
		#return pkgutil.get_data('scattertext',
		#                        'data/viz/semiotic_new.html').decode('utf-8')
		path = os.path.dirname(sys.modules['scattertext'].__file__)+'/data/'
		tokenizer_fn = path + 'punkt.english.pickle'
		tagger_fn = path + 'averaged_perceptron_tagger.pickle'
		#tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
		#tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
		# Load the tagger
		self.tagger = PerceptronTagger(load=False)
		self.tagger.load(tagger_fn)

		# note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
		#       Calling the TreebankWordTokenizer like this allows skipping the downloader.
		#       It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
		#       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
		self.tokenize = TreebankWordTokenizer().tokenize
		self.sent_detector = nltk.data.load(tokenizer_fn)

	# http://www.nltk.org/book/ch05.html
	def tag_text(self, text):
		'''take input text and return tokens w/ part of speech tags using NLTK'''
		# putting import here instead of top of file b.c. not all will have nltk installed

		sents = self.sent_detector.tokenize(text)  # TODO: this will fail on some unicode chars. I think assumes ascii
		word_pos_pairs = []

		all_tokens = []
		for sent in sents:
			tokens = self.tokenize(sent)
			all_tokens = all_tokens + tokens
			word_pos_pairs = word_pos_pairs + self.tagger.tag(tokens)
		return {'tokens': all_tokens, 'pos': [tag for (w, tag) in word_pos_pairs]}

	def tag_tokens(self, tokens):
		word_pos_pairs = self.tagger.tag(tokens)
		return {'tokens': tokens, 'pos': [tag for (w, tag) in word_pos_pairs]}
    def __init__(self, job_title_col, url_col, description_col, label_col,
                 word_col, encoded_job_title_col, indeed_file, words_file):
        '''       
        Parameters
        ----------
        job_title_col: str. column name that contains the job titles of the job postings
        url_col: str. column name that contains the urls of the job postings
        description_col: str. column name that contains the job descriptions of the job postings
        label_col: str. column name that contains the job group in set 
                        {"Data Scientist", "Machine Learning Engineer", "Data Engineer","Data Analyst", "None"}
        word_col: str. column name that contains the hard skills
        encoded_job_title_col: str. column name that contains the encoded job group
        df_indeed: pandas df. the dataframe with the scraped job postings
        df_words: pandas df. the dataframe with the hard skills
        '''
        #initialize attributes related to the dataset
        self.job_title_col = job_title_col
        self.url_col = url_col
        self.description_col = description_col
        self.label_col = label_col
        self.word_col = word_col
        self.encoded_job_title_col = encoded_job_title_col

        #load the scraped files
        self.df_indeed = self._load_data(indeed_file)
        self.df_words = self._load_data(words_file)

        #initialize attributes related to extracted features
        self.job_description = None
        self.word_list = None
        self.features_list_single = []
        self.features_list_phrase = []
        self.topk_single = None
        self.topk_phrase = None
        self.topk_full = None
        self.df_single = pd.DataFrame()
        self.df_phrase = pd.DataFrame()
        self.df = pd.DataFrame()
        self.df_tools = pd.DataFrame()
        self.df = pd.DataFrame()
        self.top_tools_dict = {}

        # Initialize attributes related to keyphrase extraction
        self.grammar = self._initialize_grammar()
        self.stop = self._initialize_stopwords()
        self.text = """ initialize """
        self.tagger = PerceptronTagger()
        self.pos_tag = self.tagger.tag
        self.chunker = nltk.RegexpParser(self.grammar)
        self.taggedToks = self.pos_tag(re.findall(r'\w+', self.text))
        self.tree = self.chunker.parse(self.taggedToks)

        #perform pre-processing, feature extraction and post-processing
        self._execute_pre_processing()
        self._execute_feature_extraction()
        self._execute_post_processing()
    def __init__(self):
        import nltk
        from nltk.tag import PerceptronTagger
        from nltk.tokenize import TreebankWordTokenizer
        #return pkgutil.get_data('scattertext',
        #                        'data/viz/semiotic_new.html').decode('utf-8')
        path = os.path.dirname(sys.modules['scattertext'].__file__) + '/data/'
        tokenizer_fn = path + 'punkt.english.pickle'
        tagger_fn = path + 'averaged_perceptron_tagger.pickle'
        #tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
        #tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
        # Load the tagger
        self.tagger = PerceptronTagger(load=False)
        self.tagger.load(tagger_fn)

        # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
        #       Calling the TreebankWordTokenizer like this allows skipping the downloader.
        #       It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
        #       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
        self.tokenize = TreebankWordTokenizer().tokenize
        self.sent_detector = nltk.data.load(tokenizer_fn)
Example #16
    def tagger(self):
        """
        Usage:

        training_corpus = list(alp.tagged_sents())
        tagger = PerceptronTagger(load=True)

        tagger.train(training_corpus)

        #sent = 'NLTK is een goeda taal voor het leren over NLP'.split()

        print(tagger.tag(article_text.split()))
        :return:
        """

        # Load Corpus
        training_corpus = list(alp.tagged_sents())
        tagger = PerceptronTagger(load=True)

        # Build tagger
        tagger.train(training_corpus)

        return tagger.tag(self.string.split())
Example #17
def status():
    from autogoal.contrib import ContribStatus

    try:
        from nltk.corpus import wordnet
        from nltk.corpus import sentiwordnet
        from nltk.corpus import stopwords

        from nltk.stem import RSLPStemmer

        st = RSLPStemmer()

        from nltk.tag import PerceptronTagger

        tagger = PerceptronTagger()
    except LookupError:
        return ContribStatus.RequiresDownload

    return ContribStatus.Ready
Example #18
def train_tagger(language, model_type, feature, train_sents):
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
    elif model_type == 'crf':
        tagger = CRFTagger()
        tagger.train(train_sents,
                     'taggers/{0}/{1}/crf.pickle'.format(language, feature))
    elif model_type == 'perceptron':
        tagger = PerceptronTagger(load=False)
        tagger.train(train_sents)

    return tagger
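
A hedged usage sketch for train_tagger, using the NLTK treebank sample as stand-in training data (the language and feature arguments only matter for the 'crf' branch, which writes its model to disk):

from nltk.corpus import treebank

train_sents = list(treebank.tagged_sents())
tagger = train_tagger('english', 'perceptron', 'pos', train_sents)
print(tagger.tag('The quick brown fox jumps over the lazy dog'.split()))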
	def __init__(self):
		import nltk
		from nltk.tag import PerceptronTagger
		from nltk.tokenize import TreebankWordTokenizer
		#return pkgutil.get_data('scattertext',
		#                        'data/viz/semiotic_new.html').decode('utf-8')
		path = os.path.dirname(sys.modules['scattertext'].__file__)+'/data/'
		tokenizer_fn = path + 'punkt.english.pickle'
		tagger_fn = path + 'averaged_perceptron_tagger.pickle'
		#tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
		#tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
		# Load the tagger
		self.tagger = PerceptronTagger(load=False)
		self.tagger.load(tagger_fn)

		# note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
		#       Calling the TreebankWordTokenizer like this allows skipping the downloader.
		#       It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
		#       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
		self.tokenize = TreebankWordTokenizer().tokenize
		self.sent_detector = nltk.data.load(tokenizer_fn)
Example #20
    def __init__(self, language, stanford=False):

        if not language:
            raise ValueError("No language specified for POS tagging")
        else:
            self._language = language

        if self._language == "eng" and stanford:
            self.model = StanfordPOSTagger(
                r'english-bidirectional-distsim.tagger')
            self.tagger = self.model.tag
        elif self._language == "eng":
            try:
                # "new" nltk with slow default behaviour through high-level API
                from nltk.tag import PerceptronTagger
                self.model = PerceptronTagger()
                self.tagger = self.model.tag
            except ImportError:
                self.model = None
                self.tagger = nltk.pos_tag
        elif self._language == "afr":
            self.model = HunposTagger(join(_MODEL_DIR, "pos-tag-model.af"),
                                      encoding='utf-8')
            self.tagger = self.model.tag
        elif self._language == "nso":
            self.model = HunposTagger(join(_MODEL_DIR,
                                           "simple-pos-tag-model.nso"),
                                      encoding='utf-8')
            self.tagger = self.model.tag
        elif self._language == "zul":
            #self.model = MarmotTagger(encoding='utf-8')
            self.model = HunposTagger(join(_MODEL_DIR,
                                           "simple-pos-tag-model.zu"),
                                      encoding='utf-8')
            self.tagger = self.model.tag
        else:
            raise ValueError(
                'Language "%s" not supported for POS tagging.\nSupply a 3 letter code form ISO-639.'
                % self._language)
Example #21
    def __init__(self,
                 df,
                 review_col,
                 truth_col,
                 copy=True,
                 analyzer=None,
                 stop_words=stopwords.words('english'),
                 pos_tag=PerceptronTagger().tag,
                 parse=RegexpParser(grammar).parse,
                 lemmatize=WordNetLemmatizer().lemmatize):

        # DataFrame stuffs
        self.df = df.copy() if copy else df
        self.review_col = review_col
        self.truth_col = truth_col

        # NLP stuffs
        self.analyzer = self.vader if analyzer is None else analyzer
        self.stop_words = stop_words
        self.pos_tag = pos_tag
        self.parse = parse
        self.lemmatize = lemmatize
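
The grammar passed to RegexpParser(grammar) in the defaults above is defined elsewhere in the source file. Other examples in this collection use an NBAR/NP chunk grammar, which would be a reasonable stand-in:

grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
"""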
from gensim.models import Phrases
from gensim.utils import SaveLoad
from gensim.models.phrases import Phraser
from nltk.corpus import stopwords  # Import the stop word list
import timeit
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer, WordNetLemmatizer
# from nltk import pos_tag, word_tokenize
from nltk.tag import PerceptronTagger
from collections import OrderedDict

# Pywsd's Lemmatizer.
porter = PorterStemmer()
wnl = WordNetLemmatizer()
tagger = PerceptronTagger()
pos_tag = tagger.tag
tokenizer = RegexpTokenizer(r'\w+')


def lemmatize(ambiguous_word,
              pos=None,
              neverstem=True,
              lemmatizer=wnl,
              stemmer=porter):
    """
    Tries to convert a surface word into lemma, and if lemmatize word is not in
    wordnet then try and convert surface word into its stem.
    This is to handle the case where users input a surface word as an ambiguous 
    word and the surface word is a not a lemma.
    """
def main():

    training_corpus = list(alp.tagged_sents())
    global tagger
    tagger = PerceptronTagger()
    tagger.train(training_corpus)
    num = 2138
    dic = {}

    Xtrain = []
    Ytrain = []
    with open("trainGxG/GxG_News.txt") as txt:
        for line in txt:
            if line[0:8] == "<doc id=":
                Ytrain.append(line.split()[3][8])
                string=[line.split('\"')[1]]
                dic[line.split('\"')[1]] = line.split()[3][8]
            elif line[0:6] == "</doc>":
                Xtrain.append(" ".join(string))
            else:
                string.append(line)

    Xtest = []
    with open("testGxG/GxG_News.txt") as txt:
        for line in txt:
            if line[0:8] == "<doc id=":
                string=[]
            elif "</doc>" in line:
                Xtest.append(" ".join(string))
            else:
                string.append(line)

    Ytest = []
    with open("testGxG/GxG_News_gold.txt") as text:
        for line in text:
            Ytest.append(line.split()[1])

    sentences = []
    for i in Xtrain[:num]:
        sentences.append(preprocess(i))


    nlp = spacy.load('nl_core_news_sm')
    veclist = []

    for sentence in sentences:
        doc = nlp(sentence)
        vec = doc.vector 
        veclist.append(vec)

    X = np.array(veclist)

    clf = KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances='auto', verbose=0, random_state=None, copy_x=True, n_jobs=None)
    labels = clf.fit_predict(X)
    pca = PCA(n_components=2).fit(X)
    coords = pca.transform(X)

    lst = []

    for index, sentence in enumerate(sentences):
        plt.text(coords[index].tolist()[0],coords[index].tolist()[1], str(dic[sentence.split()[0]]) + str(labels[index]) + ":" + str(sentence)[0:10], fontsize=4)
        lst.append(str(dic[sentence.split()[0]]) + str(labels[index]))

    label_colors=["red", "blue", "green", "yellow", "black", "purple", "cyan"]
    colors = [label_colors[i] for i in labels]
    plt.scatter(coords[:, 0], coords[:, 1], c=colors)
    centroids = clf.cluster_centers_
    centroid_coords = pca.transform(centroids)
    plt.scatter(centroid_coords[:, 0], centroid_coords[:, 1], marker="X", s=200, linewidth=2, c="#444d61")

    print(Counter(labels))

    genders = []
    for i,j in enumerate(sentences):
        if i < num:
            genders.append(dic[j.split()[0]])
    print(Counter(genders))
    print(Counter(lst))
    plt.show()
Example #24
def main(file_input):
    data_df = pd.read_csv(str(file_input) + '.csv')
    data_df = shuffle(data_df)

    print("Loaded .csv file Successfully")

    print("Total Number of Samples:", data_df.shape[0])
    print("Total Number of Features:", data_df.shape[1])

    # Missing Values
    # column with maximum missing values

    def missing_value(data_df):
        while data_df.isnull().sum().values.sum() != 0:
            col_with_missing_val = (data_df.isnull().sum()).argmax()
            data_df = data_df[data_df[col_with_missing_val].notnull(
            )]  # drop corresponding rows that have NaN values
            print("Missing Values in Features:", col_with_missing_val)
        return data_df

    #  Missing Value Treatment:
    print("Missing Value Treatment : Start")
    data_df = missing_value(data_df)
    print("Missing Value Treatment : Stop")
    print("Total Number of Samples:", data_df.shape[0])
    print("Total Number of Features:", data_df.shape[1])

    # pattern matcher for candidate feature
    # newly added features: date format, currency format, number of digits per candidate,
    # number of separators per candidate
    print("Computing Pattern Transformers: Start")

    pattern_strictlyDigits = "^[0-9]*$"
    pattern_endWithCharacters = "^\d*[\/.,@$!)(]$"  # Only digits + end with special characters
    pattern_telephone = "^0[0-9]{12}$"
    pattern_vat = "^0?[0-9]{9}$"
    pattern_date = '^[0-3]?[0-9](\/|\,|\.|\-){1}[0-9]?[0-9](\/|\,|\.|\-){1}[0-2][0-9]{1,3}$'

    pattern_currency_1 = '^[0-9]\.[0-9]+\,[0-9]*$'  # captures amounts like d.ddd,dd
    pattern_currency_2 = '^[0-9]+\,[0-9]+$'
    data_df['currency_filter'] = data_df['candidate'].str.contains(pattern_currency_1, regex=True).astype(np.int64)\
                                 | data_df['candidate'].str.contains(pattern_currency_2, regex=True).astype(np.int64)

    data_df['dates_filter'] = data_df['candidate'].str.contains(
        pattern_date, regex=True).astype(np.int64)
    data_df["Is_strictly_Digits"] = data_df["candidate"].str.contains(
        pattern_strictlyDigits, regex=True).astype(np.int64)
    data_df["endWithCharacters"] = data_df["candidate"].str.contains(
        pattern_endWithCharacters, regex=True).astype(np.int64)
    data_df["Number_of_Digits"] = data_df['candidate'].apply(
        lambda x: len(re.sub("\W", "", x)))
    data_df["Number_of_Separators"] = data_df['candidate'].apply(
        lambda x: len(re.sub("\w", "", x)))
    data_df["Length_of_Candidate"] = data_df['candidate'].apply(
        lambda x: len(x))

    # included the country code
    data_df["Telephone"] = data_df["candidate"].str.contains(
        pattern_telephone, regex=True).astype(np.int64)
    # VAT number contains 9 to 10 digits
    data_df["VATNumber"] = data_df["candidate"].str.contains(
        pattern_vat, regex=True).astype(np.int64)

    # drop blacklisted variables
    dates_index = data_df.index[data_df['dates_filter'] == 1].tolist()
    data_df = data_df.drop(index=dates_index, axis=0)
    data_df = data_df.drop("dates_filter", axis=1)
    currency_index = data_df.index[data_df['currency_filter'] == 1].tolist()
    data_df = data_df.drop(index=currency_index, axis=0)
    data_df = data_df.drop(["currency_filter"], axis=1)
    telephone_index = data_df.index[data_df['Telephone'] == 1].tolist()
    data_df = data_df.drop(index=telephone_index, axis=0)
    data_df = data_df.drop(["Telephone"], axis=1)
    vat_index = data_df.index[data_df['VATNumber'] == 1].tolist()
    data_df = data_df.drop(index=vat_index, axis=0)
    data_df = data_df.drop(["VATNumber"], axis=1)
    vat_index = data_df.index[data_df['endWithCharacters'] == 1].tolist()
    data_df = data_df.drop(index=vat_index, axis=0)
    data_df = data_df.drop(["endWithCharacters"], axis=1)

    print("Computing Pattern Transformers: Stop")

    # NLP Techniques:
    # Tokenization, Stemming, lemmatization, Frequency Distribution, Bag of words approach

    # Combine the three text columns into a single column - this column contains the full text
    data_df["Text"] = data_df["line_before"] + data_df["line_at"] + data_df[
        "line_after"]

    print("Computing Context Transformers: Start")

    # Context Transformers
    def email_match(doc):
        match = re.search(r'[\w\.-]+@[\w\.-]+', str(doc))
        if match != None:
            return 1
        else:
            return 0

    data_df["Number_of_Characters_Text"] = data_df["Text"].apply(
        lambda x: len(re.sub("[^a-z]", "", str(x))))
    data_df["Number_of_Digits_Text"] = data_df["Text"].apply(
        lambda x: len(re.sub("[^0-9]+", "", str(x))))
    data_df["Number_of_Separators_Text"] = data_df["Text"].apply(lambda x: len(
        (re.sub("[\w]+", "", str(x))).replace(" ", "")))
    data_df["Is_Email_Exists"] = data_df["Text"].apply(
        email_match)  # 1 if an email address is found, else 0
    data_df["Number_of_spaces"] = data_df["Text"].apply(
        lambda x: str(x).count(' '))  # counts number of spaces,

    # Clean Data - Tokenization, Stop word check, Size filter, Stemming - Dutch Language
    ss = SnowballStemmer("dutch", "french")

    def clean_data(doc):
        ignore = list(set(stopwords.words(
            'dutch', 'french')))  # ignore the list of stopwords
        exl_chars = list(set(string.punctuation))
        exl_chars.append('€')
        # remove email ids to avoid conflicts in vocabulary construction
        doc = re.sub("[\w\.-]+@[\w\.-]+", " ", str(doc))
        doc = re.sub("\d", " ", str(doc))
        doc = ''.join([ch for ch in doc if ch not in exl_chars])
        words = []
        for i in word_tokenize(doc):  # tokenization
            if i not in ignore:
                if len(i) >= 2:  # standalone letters do not add any value
                    i = ss.stem(i)
                    words.append(i)
        doc = ' '.join(list(set(words)))
        return doc

    print("Cleaning Text Data: Start")
    data_df["Text"] = data_df["Text"].apply(
        clean_data)  # tokenize, remove stopwords and stem
    print("Cleaning Text Data: Stop")

    print("Computing POS Vectors: Start")

    # training_corpus = alp.tagged_sents()
    alp_tagged_sent = list(alp.tagged_sents())
    tagger = PerceptronTagger(load=False)
    tagger.train(alp_tagged_sent)

    def count_adj(doc):
        tags = tagger.tag(doc.split())
        for tup in tags:
            # drop noise tokens such as 'iiiii...' whose first or last three characters repeat
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[
                    0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            if tup in tags and len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[
                    1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_adj_adv = counts['adv'] + counts['adj']
        return count_adj_adv

    def count_nn(doc):
        tags = tagger.tag(doc.split())
        for tup in tags:
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[
                    0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            if tup in tags and len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[
                    1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_nn = counts['noun']
        return count_nn

    def count_verb(doc):
        tags = tagger.tag(doc.split())
        for tup in tags:
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[
                    0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            if tup in tags and len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[
                    1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_verb = counts['verb']
        return count_verb

    data_df["Adv_Adj_Count"] = data_df["Text"].apply(count_adj)
    data_df["NN_count"] = data_df["Text"].apply(count_nn)
    data_df["Verb_count"] = data_df["Text"].apply(count_verb)

    print("Computing POS Vectors: Stop")

    print("Computing Vocabulary: Start")

    # store all the words in positive class and negative in two separate lists
    docs_pos = []

    docs_pos.extend(
        word_tokenize(words) for words in data_df.Text[data_df.gold == 1])

    docs_pos = list(itertools.chain(*docs_pos))

    # Clean text data - remove words like --- iiiiiii, hhhhhccchhhh, abvwwwwwcgdccc
    for i in docs_pos:
        first_3_characters = i[:3]
        last_3_characters = i[-3:]
        if len(i) >= 3 and first_3_characters[0] == first_3_characters[
                1] == first_3_characters[2]:
            docs_pos.remove(i)
        if i in docs_pos and len(i) >= 3 and last_3_characters[
                0] == last_3_characters[1] == last_3_characters[2]:
            docs_pos.remove(i)

    print("Positve class words are stored successfully")

    all_words_pos = nltk.FreqDist(docs_pos)

    print("Computing vocabulary based on Positive Class")
    # find popular words: a word counts as popular if it appears at least 25 times in the corpus
    popular_pos_words = []
    for i in all_words_pos.items():
        if i[1] >= 25:
            popular_pos_words.append(i[0])

    # Filter nouns from the popular positive class words
    tagged_pos_words = tagger.tag(popular_pos_words)
    filtered_tag_pos_words_nouns = []
    for word in tagged_pos_words:
        if word[1] == 'noun':
            filtered_tag_pos_words_nouns.append(word[0])
    vocab_pos = list(set(filtered_tag_pos_words_nouns))
    vocabulary = list(set(vocab_pos))

    # save vocabulary
    with open("vocab.txt", "wb") as fp:
        pickle.dump(vocabulary, fp)

    print("Computing Vocabulary: Stop")

    print("Length of Vocabulary: ", len(vocabulary))

    print("Computing Bag of Words Vectors: Start")

    def build_features(doc):
        vector = np.zeros((1, len(vocabulary)), dtype=np.int64)
        for w in word_tokenize(doc):
            for idx, vocab in enumerate(vocabulary):
                if vocab == w:
                    vector[0][idx] += 1
        return vector

    bag_vectors = data_df["Text"].apply(build_features)

    feature_vectors = np.zeros((data_df.shape[0], len(vocabulary)),
                               dtype=np.int64)
    for pos, index in enumerate(data_df.index.values):
        feature_vectors[pos, :] = bag_vectors[index]

    cols = ["BOW_" + str(col) for col in range(0, len(vocabulary))]
    for col_index, col in enumerate(cols):
        data_df[col] = feature_vectors[:,
                                       col_index].reshape(data_df.shape[0], 1)

    print("Computing Bag of Words Vectors: Stop")

    print("Computing Context Transformers: Stop")

    print("Computing Location Transformers: Start")

    data_df["location_page_nr"] = data_df["page_nr"].apply(lambda x: 100
                                                           if x >= 50 else x)
    data_df["location_line_nr"] = data_df["line_nr"].apply(lambda x: 100
                                                           if x >= 50 else x)

    print("Computing Location Transformers: Stop")

    print("Total Number of Newly Added Features:", data_df.shape[1] - 7)

    print("Building ML - Neural Network Model: Start")

    X = data_df.drop([
        "candidate", "Text", "gold", "label", "line_after", "line_at",
        "line_before", "line_nr", "page_nr"
    ],
                     axis=1)
    y = data_df.gold
    #  Normalisation
    X = (X - X.mean(axis=0)) / X.std(axis=0)

    def build_model(input_shape):
        model = Sequential()
        model.add(Dense(1024, input_shape=(input_shape, )))
        model.add(Activation('sigmoid'))

        model.add(Dense(512))
        model.add(Activation('sigmoid'))

        model.add(Dense(128))
        model.add(Activation('sigmoid'))

        model.add(Dense(1, activation="sigmoid"))

        model.compile(optimizer='adam',
                      loss=tf.keras.losses.mean_squared_error,
                      metrics=['accuracy'])
        return model

    #  Stratified k-Fold
    k_fold_outer = model_selection.StratifiedKFold(n_splits=5)
    scores = []
    split = 0
    for train_index, test_index in k_fold_outer.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]
        model = build_model(X_train.shape[1])
        history = model.fit(X_train,
                            y_train,
                            epochs=5,
                            batch_size=1024,
                            verbose=1)
        results = model.evaluate(X_val, y_val)
        scores.append(results[1])
        split += 1
        del model, history, results

    model = build_model(X.shape[1])
    model.fit(X, y, verbose=0)

    print('Saving the Model *.h5...')
    model.save('model_candidate_filter.h5')

    yHat_proba = model.predict(X)
    yHat = np.copy(yHat_proba)
    yHat[yHat <= 0.5] = 0
    yHat[yHat > 0.5] = 1

    br_score = np.around(metrics.brier_score_loss(y, yHat_proba, pos_label=1),
                         decimals=5)
    print("Storing Results in .csv file")

    confidence = np.zeros((yHat_proba.shape[0], yHat_proba.shape[1]))
    for i in range(0, yHat_proba.shape[0]):
        if yHat_proba[i] <= 0.5:
            confidence[i] = 1 - yHat_proba[i]
        else:
            confidence[i] = yHat_proba[i]

    results_data_frame = pd.DataFrame(
        columns=["Predictions", "Confidence Level"], index=data_df.index)
    results_data_frame["Predictions"] = yHat.astype(np.int64).ravel()
    results_data_frame["Confidence Level"] = np.around(confidence, decimals=4)
    results_data_frame.to_csv("Results_predictions_confidence_train.csv",
                              encoding='utf-8',
                              header=True,
                              index=True)

    return np.mean(scores), br_score
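
The training script above would typically be run with the base name of the CSV file; the file name here is hypothetical:

if __name__ == '__main__':
    mean_accuracy, brier = main('train_data')   # expects train_data.csv next to the script
    print('mean CV accuracy:', mean_accuracy, 'Brier score:', brier)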
Example #25
def labelClustersWithKeyPhrases(labels, myReader, num_clusters, n):
    top_features_list = []

    tagger = PerceptronTagger()
    pos_tag = tagger.tag
    grammar = r"""
        NBAR:
            {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
    """
    # Create phrase tree
    chunker = nltk.RegexpParser(grammar)

    stop = ENGLISH_STOP_WORDS

    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()

    # generator, generate leaves one by one
    def leaves(tree):
        """Finds NP (nounphrase) leaf nodes of a chunk tree."""
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP' or t.label() == 'JJ' or t.label() == 'RB'):
            yield subtree.leaves()

    # stemming, lematizing, lower case...
    def normalise(word):
        """Normalises words to lowercase and stems and lemmatizes it."""
        word = word.lower()
        word = stemmer.stem(word)
        word = lemmatizer.lemmatize(word)
        return word

    # stop-words and length control
    def acceptable_word(word):
        """Checks conditions for acceptable word: length, stopword."""
        accepted = bool(2 <= len(word) <= 40
                        and word.lower() not in stop)
        return accepted

    # generator, create item once a time
    def get_terms(tree):
        for leaf in leaves(tree):
            term = [normalise(w) for w, t in leaf if acceptable_word(w)]
            # Phrase only
            if len(term) > 1:
                yield term

    def flatten(npTokenList):
        finalList = []
        for phrase in npTokenList:
            token = ''
            for word in phrase:
                token += word + ' '
            finalList.append(token.rstrip())
        return finalList

    for cluster in range(num_clusters):
        indices = [index for index, clusterNum in enumerate(labels) if clusterNum == cluster] # indices of documents in cluster
        clusterCorpus = [doc_dict['negative_feedback'] for (docnum, doc_dict) in myReader.iter_docs() if docnum in indices] #
        clusterCorpus = ' '.join(clusterCorpus)

        counter = Counter()
        counter.update(flatten([word
                                for word
                                in get_terms(chunker.parse(pos_tag(re.findall(r'\w+', clusterCorpus))))
                                ]))

        most_common_n = counter.most_common(n)

        top_features = [feature[0] for feature in most_common_n]
        top_features_list.append(top_features)

    feature_names_df = pd.DataFrame(top_features_list, columns=['1', '2', '3', '4', '5'])

    return feature_names_df
Example #26
 def __init__(self):
     nltk.download('averaged_perceptron_tagger')
     self.tagger = PerceptronTagger()
     self.lemmatizer = WordNetLemmatizer()
     self.stopwords = list(stopwords.words('english'))
     self.auto_correct_remaining = 0
Example #27
 def count_nouns(self, s0):
     tagger = PerceptronTagger()
     s0_tags = tagger.tag(s0)
     NN_s0 = [values[0] for values in s0_tags if values[1] == 'NN']
     return len(NN_s0)
from collections import Counter
#count_good_raw = Counter(good_raw)
count_good_actors = Counter(good_actors)
count_good_actions = Counter(good_actions)
#number of statements
nos = len(tokenized_actions)
#number of good actors
noga = len(count_good_actors)
#number of good actions
nogc = len(count_good_actions)

PICKLE = "taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle"
import nltk.data
from nltk.tag import PerceptronTagger
_nltk_pos_tagger = PerceptronTagger(load=False)
_nltk_pos_tagger.load(PICKLE)
print(count_good_actors)
S = np.zeros(shape=(nos, noga + nogc))
i = 0
for sent_pos in tokenized_actors:
    for token1 in sent_pos:
        j = 0
        tt1 = _nltk_pos_tagger.tag([token1])
        for feature in count_good_actors:
            ft = _nltk_pos_tagger.tag([feature])
            simval = word_sim(tt1[0], ft[0], i)
            S[i][j] = S[i][j] + simval
            j = j + 1
    i = i + 1
Example #29
from nltk.tag import PerceptronTagger
from nltk.data import find
import glob


#code for loading perceptron tagger 
PICKLE = "averaged_perceptron_tagger.pickle"
AP_MODEL_LOC = 'file:'+str(find('taggers/averaged_perceptron_tagger/'+PICKLE))
tagger = PerceptronTagger(load=False)
tagger.load(AP_MODEL_LOC)
pos_tag = tagger.tag


#list to store POS and NP lists generated from each file
GlobalPOSList=[]
GlobalNPList=[]

#getting filenames of dataset files
fileList=glob.glob("C:/Users/Vinod Chhapariya/Desktop/TDBMS/Benchmark Dataset/*.txt")

#printing filenames
for filename in fileList:
        print(filename)

#POS tagging using Perceptron tagger
for filename in fileList:
        POSList=[]
        NPList=[]
        filePOSTagWrite=open(filename+"_POSTag_Perceptron",'w')
        for line in open(filename,'r').readlines():
                tags=pos_tag(line.split())
import nltk
from nltk.tag import PerceptronTagger
from nltk.data import find

def extract_entity_names(t, label):
    entity_names = []
    if hasattr(t, 'label') and t.label:
        if t.label() == label:
            entity_names.append(' '.join([child[0] for child in t]))
        else:
            for child in t:
                entity_names.extend(extract_entity_names(child, label))
    return entity_names

PICKLE = "averaged_perceptron_tagger.pickle"
AP_MODEL_LOC = 'file:'+str(find('taggers/averaged_perceptron_tagger/'+PICKLE))
tagger = PerceptronTagger(load=False)
tagger.load(AP_MODEL_LOC)
pos_tag = tagger.tag
tokenized = nltk.word_tokenize('The quick brown fox  named Ron and Donald Trump jumps over the lazy dog')
tagged = pos_tag(tokenized)
namedEnt = nltk.ne_chunk(tagged, binary=True)
names = extract_entity_names(namedEnt, 'NE')
#check=pos_tag('The quick brown fox jumps over the lazy dog'.split())
print(names)

    def get_keyphrases(self, textInput, min_freq=2):

        # setting up tagger
        # (from http://stackoverflow.com/a/35964709)
        PICKLE = "averaged_perceptron_tagger.pickle"
        AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
        tagger = PerceptronTagger(load=False)
        tagger.load(AP_MODEL_LOC)

        lemmatizer = nltk.WordNetLemmatizer()
        stemmer = nltk.stem.porter.PorterStemmer()

        # This grammar is described in the paper by S. N. Kim,
        # T. Baldwin, and M.-Y. Kan.
        # Evaluating n-gram based evaluation metrics for automatic
        # keyphrase extraction.
        # Technical report, University of Melbourne, Melbourne 2010.

        StopWords = stopwords.words('english')

        def leaves(tree):
            """Finds NP (nounphrase) leaf nodes of a chunk tree."""
            for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
                yield subtree.leaves()

        def acceptable_word(word):
            """Checks conditions for acceptable word: length, stopword."""
            accepted = bool(2 < len(word) and word.lower() not in StopWords)
            return accepted

        def normalise(word):
            """Normalises words to lowercase and stems and lemmatizes it."""
            word = word.lower()
            word = stemmer.stem(word)
            word = lemmatizer.lemmatize(word)
            return word

        def get_terms(tree):
            for leaf in leaves(tree):
                # can change normalise to w.lower() if you don't want to normalize the word
                term = [normalise(w) for w, t in leaf if acceptable_word(w)]
                yield term

        def get_nounPhrases(textInput, minWordLength=2):

            grammar = r"""

            NBAR:
                {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

            NP:
                {<NBAR>}
                {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
                      """

            chunker = nltk.RegexpParser(grammar)

            toks = nltk.word_tokenize(textInput)
            # print(toks)
            pos_tag = tagger.tag
            postoks = pos_tag(toks)

            tree = chunker.parse(postoks)
            terms = get_terms(tree)

            nounPhraseList = []
            for tid, term in enumerate(terms):
                templist = []
                for wid, word in enumerate(term):
                    # print("TID: ",tid," WID: ",(wid+1), word)
                    templist.append(word)

                s = " "
                nounPhraseList.append(s.join(templist))

            nounPhraseList = [word for word in nounPhraseList if len(word.split()) >= minWordLength]
            return nounPhraseList

        counter = Counter()
        for nounPhrase in get_nounPhrases(textInput):
            # print(nounPhrase)
            counter.update([nounPhrase])

        keyphraseDF = pandas.DataFrame([[key, value] for key, value in counter.items() if value>=min_freq],
                                       columns=['keyphrase_stemmed', 'frequency'])
        (docsDF, occurrenceDF) = self.get_occurrence(keyphraseDF)
        print("docs", docsDF)
        print("keys", keyphraseDF)
        keyphraseDF = keyphraseDF.join(docsDF["docs"])
        print(occurrenceDF)
        keyphraseDF = keyphraseDF.join(self.get_fullphrases(keyphraseDF=keyphraseDF)["keyphrase_full"])
        keyphraseDF = keyphraseDF.join(self.get_MIs(occurrenceDF=occurrenceDF)["MI"])
        keyphraseDF = keyphraseDF.join(
            self.get_PMIs(occurrenceDF=occurrenceDF, metric="sentiment_class", value="positive")["PMI_pos"])
        keyphraseDF = keyphraseDF.join(
            self.get_PMIs(occurrenceDF=occurrenceDF, metric="sentiment_class", value="negative")["PMI_neg"])
        #keyphraseDF = keyphraseDF.join(self.get_PMIs(keyphraseDF["Keyphrase_stemmed"].tolist(), "neg"))

        return keyphraseDF
Example #32
		#sentence= sentence.rstrip()
		doc = nlp(sentence)
		for token in doc:
			dependency = [token.text, token.dep_,
         	token.shape_, token.is_alpha, token.is_stop,[child for child in token.children]]
			
			
			if dependency[0] == "\n":
				whole_sen.append(parsed)
				parsed=[]

			else:
				parsed.append(dependency)

fr=[]
tagger = PerceptronTagger()
with open('frysian_data.txt', 'r',  encoding="utf-8") as fr_infile:
	for sentence in fr_infile:
		sentence = word_tokenize(sentence)
		pos = tagger.tag(sentence)
		
		fr.append(pos)
other=[]
final =[]
fr_longer=[]
for k in range(len(fr)):
	fries = fr[k]
	parsed = whole_sen[k]
	if len(fries) == len(parsed):
		for words, fr_words in zip(parsed,fries):
			print(words[0])
Example #33
def main(file_input):
    test_data = pd.read_csv(str(file_input) + '.csv')
    # test_data = pd.read_csv(str(file_input) + '.csv', index_col='Unnamed: 0')

    print("Loaded .csv file Successfully")

    print("Missing Value Treatment : Start")
    # missing values Treatment
    while test_data.isnull().sum().values.sum() != 0:
        col_with_missing_val = (test_data.isnull().sum()).argmax()
        test_data = test_data[test_data[col_with_missing_val].notnull(
        )]  # drop corresponding rows that have NaN values
        print(col_with_missing_val)

    print("Missing Value Treatment : Stop")
    print("Total Number of Samples:", test_data.shape[0])
    print("Total Number of Features:", test_data.shape[1])

    print("Computing Pattern Transformers: Start")
    # pattern transformers
    pattern_strictlyDigits = "^[0-9]*$"
    test_data["strictly_Digits"] = test_data["candidate"].str.contains(
        pattern_strictlyDigits, regex=True).astype(np.int64)
    test_data["Number_of_Digits"] = test_data['candidate'].apply(
        lambda x: len(re.sub("\W", "", x)))
    test_data["Number_of_Seprators"] = test_data['candidate'].apply(
        lambda x: len(re.sub("\w", "", x)))
    test_data["Length_of_Candidate"] = test_data['candidate'].apply(
        lambda x: len(x))

    print("Computing Pattern Transformers: Stop")
    print("Computing Context Transformers: Start")
    # context transformers
    test_data["Text"] = test_data["line_before"] + test_data[
        "line_at"] + test_data["line_after"]

    def email_match(doc):
        match = re.search(r'[\w\.-]+@[\w\.-]+', str(doc))
        if match != None:
            return 1
        else:
            return 0

    test_data["Number_of_Characters_Text"] = test_data["Text"].apply(
        lambda x: len(re.sub("[^a-z]", "", str(x))))
    test_data["Number_of_Digits_Text"] = test_data["Text"].apply(
        lambda x: len(re.sub("[^0-9]+", "", str(x))))
    test_data["Number_of_Separators_Text"] = test_data["Text"].apply(
        lambda x: len((re.sub("[\w]+", "", str(x))).replace(" ", "")))
    test_data["Email_Exists"] = test_data["Text"].apply(
        email_match)  # 1 if an email address is found, else 0
    test_data["Number_of_spaces"] = test_data["Text"].apply(
        lambda x: str(x).count(' '))  # counts number of spaces

    # Clean Data - Tokenization, Stop word check, Size filter, Stemming - Dutch Language
    ss = SnowballStemmer("dutch", "french")

    def clean_data(doc):
        ignore = list(set(stopwords.words(
            'dutch', 'french')))  # ignore the list of stopwords
        exl_chars = list(set(string.punctuation))
        exl_chars.append('€')
        doc = re.sub(
            "[\w\.-]+@[\w\.-]+", " ", str(doc)
        )  # remove email ids to avoid conflicts in vocabulary construction
        doc = re.sub("\d", " ", str(doc))
        doc = ''.join([ch for ch in doc if ch not in exl_chars])
        words = []
        for i in word_tokenize(doc):  # tokenization
            if i not in ignore:
                if len(i) >= 2:  # standalone letters do not add any value
                    i = ss.stem(i)
                    words.append(i)
        doc = ' '.join(list(set(words)))
        return doc

    test_data["Text"] = test_data["Text"].apply(
        clean_data)  # tokenize, remove stopwords and stem

    # training_corpus = alp.tagged_sents()
    alp_tagged_sent = list(alp.tagged_sents())
    tagger = PerceptronTagger(load=False)
    tagger.train(alp_tagged_sent)

    def count_adj(doc):
        tags = tagger.tag(doc.split())
        for tup in tags:
            # drop noise tokens such as 'iiiii...' whose first or last three characters repeat
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[
                    0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            if tup in tags and len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[
                    1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_adj_adv = counts['adv'] + counts['adj']
        return count_adj_adv

    def count_nn(doc):
        tags = tagger.tag(doc.split())
        for tup in tags:
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[
                    0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            if tup in tags and len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[
                    1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_nn = counts['noun']
        return count_nn

    def count_verb(doc):
        tags = tagger.tag(doc.split())
        for tup in tags:
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[
                    0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            if tup in tags and len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[
                    1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_verb = counts['verb']
        return count_verb

    test_data["Adv_Adj_Count"] = test_data["Text"].apply(count_adj)
    test_data["NN_count"] = test_data["Text"].apply(count_nn)
    test_data["Verb_count"] = test_data["Text"].apply(count_verb)

    print("Computing Context Transformers: Stop")
    # load the vocabulary
    with open("vocab.txt", "rb") as fp:
        vocabulary = pickle.load(fp)

    print("Computing Bag of Words Vectors: Start")

    def build_features(doc):
        vector = np.zeros((1, len(vocabulary)), dtype=np.int64)
        for w in word_tokenize(doc):
            for i, word in enumerate(vocabulary):
                if word == w:
                    vector[0][i] += 1
        return vector

    bag_vectors = test_data["Text"].apply(build_features)
    feature_vectors = np.zeros((test_data.shape[0], len(vocabulary)),
                               dtype=np.int64)
    for pos, index in enumerate(test_data.index.values):
        feature_vectors[pos, :] = bag_vectors[index]
    cols = ["BOW_" + str(col) for col in range(0, len(vocabulary))]
    for col_index, col in enumerate(cols):
        test_data[col] = feature_vectors[:, col_index].reshape(
            test_data.shape[0], 1)

    print("Computing Bag of Words Vectors: Stop")

    print("Computing Location Transformers: Start")

    test_data["location_page_nr"] = test_data["page_nr"].apply(
        lambda x: 100 if x >= 50 else x)
    test_data["location_line_nr"] = test_data["line_nr"].apply(
        lambda x: 100 if x >= 50 else x)

    print("Computing Location Transformers: Stop")

    print("Loading Model...")
    model = tf.keras.models.load_model('model_candidate_filter.h5')
    model.compile(loss=tf.keras.losses.mean_squared_error,
                  optimizer='adam',
                  metrics=['accuracy'])
    print("Loaded Model Successfully!")

    X_test = test_data.drop([
        "candidate", "Text", "label", "line_after", "line_at", "line_before",
        "page_nr", "line_nr"
    ],
                            axis=1)

    X_test = (X_test - X_test.mean(axis=0)) / X_test.std(axis=0)
    yHat_proba = model.predict(X_test)
    yHat = np.copy(yHat_proba)
    yHat[yHat <= 0.5] = 0
    yHat[yHat > 0.5] = 1

    print("Storing Results in .csv file")

    confidence = np.zeros((yHat_proba.shape[0], yHat_proba.shape[1]))
    for i in range(0, yHat_proba.shape[0]):
        if yHat_proba[i] <= 0.5:
            confidence[i] = 1 - yHat_proba[i]
        else:
            confidence[i] = yHat_proba[i]

    results_data_frame = pd.DataFrame(
        columns=["Predictions", "Confidence Level"], index=test_data.index)
    results_data_frame["Predictions"] = yHat.astype(np.int64).ravel()
    results_data_frame["Confidence Level"] = np.around(confidence, decimals=4)
    results_data_frame.to_csv("Results_predictions_confidence_run.csv",
                              encoding='utf-8',
                              header=True,
                              index=True)
Example #34
 def count_verbs(self, s0):
     tagger = PerceptronTagger()
     s0_tags = tagger.tag(s0)
     V_s0 = [values[0] for values in s0_tags if values[1] == 'VBP']
     return len(V_s0)
Example #35
def score_dataset(predicted_text, tagged_text):
    scores = []
    for sent1, sent2 in zip(predicted_text, tagged_text):
        scores.append(score(sent1, sent2))
    average_score = sum(scores) / len(predicted_text)
    return average_score


# Tagged text remains the same from the Penn Treebank
tagged_text = list(treebank.tagged_sents())

# 1. Perceptron Tagger
from nltk.tag import PerceptronTagger
data = list(data)
tagger = PerceptronTagger()
predicted_text = []
for sent in data:
    predicted_text.append(tagger.tag(sent))

perceptron_score = score_dataset(predicted_text, tagged_text)
# 1.804

# 2. nltk.Pos_tag Tagger
from nltk import pos_tag
predicted_text = []
for sent in data:
    predicted_text.append(pos_tag(sent))

nltk_score = score_dataset(predicted_text, tagged_text)
# 1.804
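
The score helper used by score_dataset is not shown here. One plausible sketch, under the assumption that it counts tag disagreements between a predicted sentence and the gold sentence:

def score(predicted_sent, gold_sent):
    # number of positions where the predicted tag differs from the gold tag
    return sum(1 for (w_p, t_p), (w_g, t_g) in zip(predicted_sent, gold_sent)
               if t_p != t_g)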