Example #1
def tokenization(corpus, stop_words=nltk.corpus.stopwords.words('portuguese')):
    '''Input : corpus is a Series of documents (sentences)
       Output : a list of lists of words

    stop_words : list of words to be removed
    '''

    # Tokenization
    spacetok = SpaceTokenizer()
    corpus = [spacetok.tokenize(phrase) for phrase in corpus]

    # Stopword removal
    if stop_words is not None:
        corpus = [[word for word in phrase if word not in stop_words]
                  for phrase in corpus]

    return corpus
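A minimal usage sketch (added here, not part of the original snippet). It assumes nltk and SpaceTokenizer are imported as above and that the Portuguese stopword corpus was downloaded before the function definition ran (the default argument is evaluated at definition time); the call below passes an explicit stop list so the output is deterministic.

import nltk
from nltk.tokenize import SpaceTokenizer

# nltk.download('stopwords') must have run before tokenization() was defined,
# because its default argument loads the Portuguese stopword list.
docs = ["eu gosto de processamento de linguagem natural",
        "a tokenizacao separa as palavras"]
print(tokenization(docs, stop_words=['eu', 'de', 'a', 'as']))
# [['gosto', 'processamento', 'linguagem', 'natural'],
#  ['tokenizacao', 'separa', 'palavras']]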
Example #2
def displayPageView(request):
    mycursor.execute('TRUNCATE table logs_c')
    filePath = request.GET['input-file']
    filePath = "C:/Users/Rhishabh/Documents/mithi hackathon/" + filePath
    log = readfile(filePath)

    line = log.readline()
    tk = SpaceTokenizer()
    tokens = tk.tokenize(line)
    while line:
        tokens = tk.tokenize(line)
        process(tokens)
        line = log.readline()

    mydb.commit()
    
    result1 = query_1()
    result2 = query2()
    result3 = query3()
    result4 = query4()
    result5 = query5()
    result7 = query7()

    # mydb.close()
    return render(request, 'display.htm', {'ipfile': filePath, 'result1': result1, 'result2': result2, 'result3': result3, 'result4': result4, 'result5': result5, 'result7': result7})
Example #3
def tokenize(s):
    out = []
    tokens = SpaceTokenizer().tokenize(s)
    for w in tokens:
        if w[:1] == "\n":
            out.append("\n")
            out.append(w[1:])
        else:
            out.append(w)
    return out
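A quick illustrative call (added): SpaceTokenizer splits only on spaces, so a token that begins with a newline is split by this helper into a standalone "\n" marker followed by the word.

from nltk.tokenize import SpaceTokenizer  # required by tokenize() above

print(tokenize("Good muffins \ncost $3.88"))
# ['Good', 'muffins', '\n', 'cost', '$3.88']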
Example #4
def extract_name(tweet):
    token = SpaceTokenizer()
    toks = token.tokenize(tweet)
    pos = pos_tag(toks)
    chunked_nes = ne_chunk(pos)
    nes = [
        ' '.join(map(lambda x: x[0], ne.leaves())) for ne in chunked_nes
        if isinstance(ne, nltk.tree.Tree)
    ]
    return nes
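An illustrative call (added, not from the source). It assumes the snippet's imports (nltk, pos_tag, ne_chunk, SpaceTokenizer) plus the downloaded tagger and NE-chunker models; the exact entities depend on the NLTK models, so the output shown is only indicative.

import nltk
from nltk import pos_tag, ne_chunk
from nltk.tokenize import SpaceTokenizer

# One-time model downloads, e.g.:
# nltk.download('averaged_perceptron_tagger'); nltk.download('maxent_ne_chunker'); nltk.download('words')
print(extract_name("Barack Obama met Angela Merkel in Berlin today"))
# typically something like ['Barack Obama', 'Angela Merkel', 'Berlin']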
Example #5
def read_sent(sent):
    sent = SpaceTokenizer().tokenize(sent)
    start = int(sent[0])
    end = int(sent[1])
    dataset = int(sent[2])
    sent = sent[4:]
    labels = [0] * len(sent)
    if dataset != 0:
        labels[start:end + 1] = [1] * (end + 1 - start)
    return [(sent[i], str(labels[i])) for i in range(len(sent))]
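An illustrative input line (added). The format appears to be "start end dataset_id <skipped field> token token ..."; the fourth field is dropped by sent[4:], and tokens in [start, end] are labelled 1 when dataset_id is non-zero.

from nltk.tokenize import SpaceTokenizer  # required by read_sent() above

print(read_sent("1 2 42 7 the ATLAS experiment dataset"))
# [('the', '0'), ('ATLAS', '1'), ('experiment', '1'), ('dataset', '0')]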
Example #6
def analyze_line(line):
    tokens = pos_tag(SpaceTokenizer().tokenize(line))

    names = []
    for token in tokens:
        if token[1] == 'NNP':
            names.append(re.sub('[' + string.punctuation + ']', '', token[0]))

    return {
        "names": names,
        "sentiment": SentimentIntensityAnalyzer().polarity_scores(line)
    }
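An illustrative call (added). It assumes the module-level imports the snippet relies on (re, string, pos_tag, SpaceTokenizer, SentimentIntensityAnalyzer) and the downloaded tagger and VADER lexicon; the exact names and scores depend on the NLTK models.

import re
import string
from nltk import pos_tag
from nltk.tokenize import SpaceTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Requires nltk.download('averaged_perceptron_tagger') and nltk.download('vader_lexicon').
result = analyze_line("Alice loves the new London office!")
print(result["names"])      # proper nouns with punctuation stripped, e.g. ['Alice', 'London']
print(result["sentiment"])  # VADER scores, e.g. {'neg': 0.0, 'neu': ..., 'pos': ..., 'compound': ...}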
Example #7
def fun_1_1_5():
    import nltk
    from nltk.tokenize import RegexpTokenizer
    from nltk.tokenize import regexp_tokenize
    tokenizer = RegexpTokenizer(r"[\w]+")
    print("RegexpTokenizer:", tokenizer.tokenize(
        "Don't hesitate to ask questions"))
    print("regexp_tokenize:", regexp_tokenize(
        "Don't hesitate to ask questions", pattern=r"\w+|\$[\d\.]+|\S+"))
    # Split on whitespace
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    print("RegexpTokenizer:", tokenizer.tokenize(
        "Don't hesitate to ask questions"))
    # Keep only words that start with a capital letter
    sent = " She secured 90.56 % in class X \n. She is a meritorious student"
    capt = RegexpTokenizer(r'[A-Z]\w+')
    print("RegexpTokenizer:", capt.tokenize(sent))
    # A subclass of RegexpTokenizer that uses a predefined regular expression
    from nltk.tokenize import BlanklineTokenizer
    print("BlanklineTokenizer:", BlanklineTokenizer().tokenize(sent))
    # Strings can also be split on spaces, gaps, newlines, etc.
    from nltk.tokenize import WhitespaceTokenizer
    print("WhitespaceTokenizer:", WhitespaceTokenizer().tokenize(sent))
    # WordPunctTokenizer uses the regular expression \w+|[^\w\s]+ to split the
    # text into alphabetic and non-alphabetic characters
    from nltk.tokenize import WordPunctTokenizer
    print("WordPunctTokenizer:", WordPunctTokenizer().tokenize(sent))
    # Splitting with the split() method
    print("split():", sent.split())
    print("split(' '):", sent.split(' '))
    print("split('\\n'):", sent.split('\n'))
    # Similar to sent.split('\n'), LineTokenizer splits the text into lines
    from nltk.tokenize import LineTokenizer
    print("LineTokenizer:", LineTokenizer().tokenize(sent))
    print("LineTokenizer:", LineTokenizer(blanklines='keep').tokenize(sent))
    print("LineTokenizer:", LineTokenizer(blanklines='discard').tokenize(sent))
    # SpaceTokenizer works like sent.split(' ')
    from nltk.tokenize import SpaceTokenizer
    print("SpaceTokenizer:", SpaceTokenizer().tokenize(sent))
    # The nltk.tokenize.util module tokenizes by returning a sequence of
    # tuples giving each token's position and offset within the sentence
    print("Token spans:", list(WhitespaceTokenizer().span_tokenize(sent)))
    # Given a sequence of token spans, the relative spans can be returned
    from nltk.tokenize.util import spans_to_relative
    print("Relative spans:", list(
        spans_to_relative(WhitespaceTokenizer().span_tokenize(sent))))
    # By splitting at each occurrence of the separator,
    # nltk.tokenize.util.string_span_tokenize(sent, separator) returns the
    # offsets of the tokens in sent:
    from nltk.tokenize.util import string_span_tokenize
    print("Token spans:", list(string_span_tokenize(sent, " ")))
Example #8
    def get_vocab(self, start_index=2, min_count=10):
        text = ''.join(list(self.publications['full_text'].values))
        all_words = SpaceTokenizer().tokenize(text + text.lower())
        vocab = Counter(all_words).most_common()
        vocab_out_json = {}
        for word, count in vocab:
            if count > min_count:
                vocab_out_json[word] = len(vocab_out_json) + start_index

        print(len(vocab) - len(vocab_out_json), ' words are discarded as OOV')
        print(len(vocab_out_json), ' words are in vocab')

        with codecs.open(self.outdir + 'vocab.json', 'w', encoding='utf-8') as vocabfile:
            json.dump(vocab_out_json, vocabfile)
Example #9
def read_doc(doc, labels):
    doc = SpaceTokenizer().tokenize(doc.strip())
    # doc = doc.strip().split()
    labels = labels.strip().split('|')
    labels = [la.split() for la in labels]
    for i in range(len(labels)):
        for j in range(len(labels[i])):
            labels[i][j] = int(labels[i][j])

    res_labels = [0] * len(doc)
    for la in labels:
        if la[2] != 0:
            start = la[0]
            end = la[1]
            res_labels[start:end + 1] = [1] * (end + 1 - start)
    return [(doc[i], str(res_labels[i])) for i in range(len(doc))]
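An illustrative call (added). The labels string holds '|'-separated groups of integers; judging by the gold-data writer in Example #18, each group looks like "start end data_set_id publication_id", and a group whose third field is 0 marks a document with no mention.

from nltk.tokenize import SpaceTokenizer  # required by read_doc() above

doc = "we use the ATLAS Open Data release here"
labels = "3 5 42 7 | -1 -1 0 7"
print(read_doc(doc, labels))
# [('we', '0'), ('use', '0'), ('the', '0'), ('ATLAS', '1'),
#  ('Open', '1'), ('Data', '1'), ('release', '0'), ('here', '0')]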
Example #10
def space_tokenizer(text, strip=None):
    '''Split only on the " " (space) character. Same as s.split(" ").
    >>> s = "Good muffins cost $3.88\nin New York. It's inexpensive. Free-for-all. Please buy me\ntwo of them.\n\nThanks."
    >>> SpaceTokenizer().tokenize(s)
    ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.',
    "It's", 'inexpensive.', 'Free-for-all.',
    'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    >>> s.split(' ')
    ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.',
    "It's", 'inexpensive.', 'Free-for-all.',
    'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    >>>'''

    for token in SpaceTokenizer().tokenize(text):
        if token not in patterns.PUNCTUATION and not token.isspace():
            yield token.strip(strip)
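An illustrative call (added). patterns.PUNCTUATION is an external name in the original module, so a hypothetical stand-in is stubbed here just to make the sketch runnable.

from nltk.tokenize import SpaceTokenizer

class patterns:                          # stand-in for the module referenced above
    PUNCTUATION = {'.', ',', '!', '?', ';', ':'}

print(list(space_tokenizer("Good muffins cost $3.88 , really !")))
# ['Good', 'muffins', 'cost', '$3.88', 'really']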
Example #11
def tokenizarPorTipo():
    cadena = "Sorry, I can't go to the meeting.\n"
    print("TreebankWordTokenizer - 1")
    print("WhitespaceTokenizer - 2")
    print("SpaceTokenizer - 3")
    print("WordPunctTokenizer - 4")
    num = input("Introduzca un tokenizer: ")
    if num == "1":
        tokenizer = TreebankWordTokenizer()
    elif num == "2":
        tokenizer = WhitespaceTokenizer()
    elif num == "3":
        tokenizer = SpaceTokenizer()
    elif num == "4":
        tokenizer = WordPunctTokenizer()
    else:
        return

    tokens = tokenizer.tokenize(cadena)
    print(tokens)
Example #12
from nltk.tag import pos_tag
import nltk.tokenize
from nltk.corpus import cmudict
from wordgen import gen_word
from nltk import ne_chunk
from nltk.tokenize import SpaceTokenizer

sentence = "who is Mahatma Gandhi visiting I'm HIS PRETTY GIRLFRIEND a Denny's McDonalds in broad daylight Shtruus"
tokenizer = SpaceTokenizer()
toks = tokenizer.tokenize(sentence)
pos = pos_tag(toks)
chunked_nes = ne_chunk(pos)
print(chunked_nes)
nes = [
    ' '.join(map(lambda x: x[0], ne.leaves())) for ne in chunked_nes
    if isinstance(ne, nltk.tree.Tree)
]

print(nes)
'''
qry = "who is Mahatma Gandhi"
tokens = nltk.tokenize.word_tokenize(qry)
pos = nltk.pos_tag(tokens)
sentt = nltk.ne_chunk(pos, binary = False)
print(sentt)
person = []
for subtree in sentt.subtrees(filter=lambda t: t.label() == 'PERSON'):
    for leave in subtree.leaves():
        person.append(leave)
print("person=", person)
'''
Example #13
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())

print(names.fileids())
print(len(names.words('male.txt')))

reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged",
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())

print("\n")
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))

print(treebank.tagged_words())
from nltk import CFG,ChartParser
from nltk.tokenize import SpaceTokenizer
grammar = CFG.fromstring("""
  S -> NP VP
  NP -> Det N
  VP -> IV
  Det -> 'the'
  N -> 'man'
  IV -> 'walks'
  """)
#>>> grammar
#<Grammar with 6 productions>
#>>> grammar.start()
#S
#>>> grammar.productions()
#[S -> NP VP, NP -> Det N, VP -> IV, Det -> 'the', N -> 'man', IV -> 'walks']
parser = ChartParser(grammar)
parses = parser.parse_all(SpaceTokenizer().tokenize("the man walks"))
#>>> parses
#[Tree('S', [Tree('NP', [Tree('Det', ['the']), Tree('N', ['man'])]), Tree('VP', [Tree('IV', ['walks'])])])]
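A small follow-up (added) that renders the parse produced above; pretty_print() is a standard method of nltk Tree objects.

for tree in parses:
    print(tree)            # (S (NP (Det the) (N man)) (VP (IV walks)))
    tree.pretty_print()    # ASCII drawing of the same tree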
of comments. Words are stemmed, then each occurrence of 
a word is counted. Each word is only counted once per
comment. Out of the 300 most frequent words in the toxic
category and the 300 most frequent words in the clean
category, the vocabulary is selected as the words in the
toxic category but not in the clean category. 

@author: Alex
"""

from nltk.tokenize import SpaceTokenizer
from sklearn.feature_extraction import text
import pandas as pd
from nltk.stem.porter import PorterStemmer

s_tok = SpaceTokenizer()
porter = PorterStemmer()

additional_stopwords = ['wiki', 'utc', 'wikipedia']
stop_words = text.ENGLISH_STOP_WORDS.union(additional_stopwords)
stemmed_stop_words = set([porter.stem(w) for w in stop_words])


def makeFrequencyDict(comments):
    '''
    Make a dictionary of the frequency a word appears in the corpus.
    Frequencies are normalized by number of example comments.
    '''
    n_examples = len(comments)
    freqDict = {}
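The frequency-counting function above is cut off in this excerpt. The sketch below is a hypothetical reconstruction (not the original code) of the procedure the docstring describes: count each stemmed word at most once per comment, normalize by the number of comments, then keep the 300 most frequent toxic-comment words that do not also appear among the 300 most frequent clean-comment words.

def makeFrequencyDict_sketch(comments):
    '''Hypothetical reconstruction: per-comment presence counts of stemmed
    words, normalized by the number of comments.'''
    n_examples = len(comments)
    freqDict = {}
    for comment in comments:
        seen = set()
        for word in s_tok.tokenize(comment):
            stem = porter.stem(word.lower())
            if stem in stemmed_stop_words or stem in seen:
                continue
            seen.add(stem)
            freqDict[stem] = freqDict.get(stem, 0) + 1.0 / n_examples
    return freqDict


def selectVocabulary(toxic_comments, clean_comments, k=300):
    '''Hypothetical helper: top-k toxic words not among the top-k clean words.'''
    toxic_freq = makeFrequencyDict_sketch(toxic_comments)
    clean_freq = makeFrequencyDict_sketch(clean_comments)
    top_toxic = sorted(toxic_freq, key=toxic_freq.get, reverse=True)[:k]
    top_clean = set(sorted(clean_freq, key=clean_freq.get, reverse=True)[:k])
    return [w for w in top_toxic if w not in top_clean]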
    def __init__(self):
        self.tokenizer = SpaceTokenizer()
Example #17
        protocol = tokens[5][1:-1]
        status = tokens[6]
        size = tokens[7][:-1]
#     print(ip + ' ' + date_time + ' ' + method + ' ' + url + ' ' + protocol + ' ' + status + ' ' + size)

    val = (ip, date_time, method, url, protocol, status, size)
    mycursor.execute(sql, val)


# Type 0 -> Tab Separated (Server 1)
# Type 1 -> Space Separated (Server 2)

log = readfile("access_log")

line = log.readline()
tk = SpaceTokenizer()
tokens = tk.tokenize(line)

while line:
    tokens = tk.tokenize(line)
    process(tokens)
    line = log.readline()

mydb.commit()
print("records inserted.")

# Top client ip addresses by number of requests
sql = "SELECT IP, count(*) FROM logs_c GROUP BY IP ORDER BY count(*) DESC LIMIT 5"
mycursor.execute(sql)
results = mycursor.fetchall()
Example #18
    def get_test_docs(self):
        test_doc_ids = open(self.outdir + '/test_doc_ids', 'w+')
        test_docs = open(self.outdir + '/test_docs', 'w+')
        golden_data = open(self.outdir + '/test_doc_gold', 'w+')

        test_doc_list = []
        for doc in set(self.data_set_citations['publication_id']):
            if np.random.randint(
                    0, 100) < 10 and doc not in self.zero_shot_doc_ids:
                test_doc_list.append(doc)
                test_doc_ids.write(str(doc) + '\n')
        logger.info(str(len(test_doc_list)) + ' test docs selected')

        pub_ids = list(self.data_set_citations['publication_id'])
        pos_tokens = 0
        neg_tokens = 0
        #to locate lines with relevant pubs
        for pub_id in test_doc_list:
            pub_text = self.full_text[str(pub_id) + '.txt']
            test_docs.write(pub_text + '\n')
            pub_text_tokens = list(SpaceTokenizer().tokenize(pub_text))
            pub_text_spans = list(SpaceTokenizer().span_tokenize(pub_text))
            cur_pos_tokens = 0
            cur_neg_tokens = len(pub_text_tokens)

            res_line = []
            rows = [pub_ids.index(i) for i in pub_ids if i == pub_id]
            for idx in rows:
                d_row = self.data_set_citations.loc[idx]
                for mention_text in d_row['mention_list']:
                    mention_text = re.sub('\d', ' ', mention_text)
                    # mention_text = re.sub('[^ ]- ', '', mention_text)
                    mention_text_spans = list(
                        SpaceTokenizer().span_tokenize(mention_text))

                    index_finder_lower = findall_lower(mention_text, pub_text)
                    found_indices = [idx for idx in index_finder_lower]

                    for find_index in found_indices:
                        try:
                            if find_index != -1:
                                new_mention_text_spans = [
                                    (indices[0] + find_index,
                                     indices[1] + find_index)
                                    for indices in mention_text_spans
                                ]
                                cur_pos_tokens += len(mention_text_spans)

                                res_line.append(
                                    (pub_text_spans.index(
                                        new_mention_text_spans[0]),
                                     pub_text_spans.index(
                                         new_mention_text_spans[-1]),
                                     d_row['data_set_id'],
                                     d_row['publication_id']))
                        except ValueError:
                            # mention span not aligned to token boundaries; skip
                            pass
            res_line = list(set(res_line))
            if len(res_line) == 0:
                # no mentions at all
                res_line.append((-1, -1, 0, pub_id))
            i = 0
            for c in res_line:
                if i > 0:
                    golden_data.write(' | ' + str(c[0]) + ' ' + str(c[1]) +
                                      ' ' + str(c[2]) + ' ' + str(c[3]))
                else:
                    golden_data.write(
                        str(c[0]) + ' ' + str(c[1]) + ' ' + str(c[2]) + ' ' +
                        str(c[3]))
                i += 1
            golden_data.write('\n')
            pos_tokens += cur_pos_tokens
            neg_tokens += (cur_neg_tokens - cur_pos_tokens)

        test_doc_ids.close()
        test_docs.close()
        golden_data.close()

        logger.info(str(pos_tokens) + " pos tokens added.")
        logger.info(str(neg_tokens) + " neg tokens added.")
        logger.info("neg token percentage: {}".format(
            neg_tokens * 100 / (pos_tokens + neg_tokens)))
Example #19
def text_pre_processing(text,
                        remove_number=True,
                        stop_word=True,
                        stop_word_language='english',
                        remove_punctuation=True):
    # ---------------------------------------------
    # Patterns
    results_chunk = ''
    results_named_entity = ''

    patterns1 = r'@[A-Za-z0-9_]+'
    patterns2 = r'https?://[^ ]+'
    combined_patterns = r'|'.join((patterns1, patterns2))
    www_patterns = r'www.[^ ]+'
    negations_dic = {
        "isn't": "is not",
        "aren't": "are not",
        "wasn't": "was not",
        "weren't": "were not",
        "haven't": "have not",
        "hasn't": "has not",
        "hadn't": "had not",
        "won't": "will not",
        "wouldn't": "would not",
        "don't": "do not",
        "doesn't": "does not",
        "didn't": "did not",
        "can't": "can not",
        "couldn't": "could not",
        "shouldn't": "should not",
        "mightn't": "might not",
        "mustn't": "must not"
    }
    negations_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) +
                                   r')\b')

    # ---------------------------------------------
    # Coerce the input to a string
    results = str(text)

    # ---------------------------------------------
    # Text Cleaning
    results = re.sub(combined_patterns, '', results)
    results = re.sub(www_patterns, '', results)
    results = results.lower()
    results = negations_pattern.sub(lambda x: negations_dic[x.group()],
                                    results)
    results = re.sub("[^a-zA-Z]", " ", results)

    results = results.replace("(<br/>)", "")
    results = results.replace('(<a).*(>).*(</a>)', '')
    results = results.replace('(&amp)', '')
    results = results.replace('(&gt)', '')
    results = results.replace('(&lt)', '')
    results = results.replace('(\xa0)', ' ')

    # ---------------------------------------------
    if remove_number and results != '':
        results = re.sub(r'\d+', '', results)

    # ---------------------------------------------
    if remove_punctuation and results != '':
        translator = str.maketrans('', '', string.punctuation)
        results = results.translate(translator)

    # ---------------------------------------------
    # Strip leading/trailing whitespace
    results = results.strip()

    # ---------------------------------------------
    # Line Tokenize
    if results != '':
        line_tokenizer = LineTokenizer()
        results = line_tokenizer.tokenize(results)
        results = list(filter(None, results))
        results = results[0]

    # ---------------------------------------------
    # Tab Tokenize
    if results != '':
        tab_tokenizer = TabTokenizer()
        results = tab_tokenizer.tokenize(results)
        results = list(filter(None, results))
        results = results[0]

    # ---------------------------------------------
    # Space Tokenizer
    if results != '':
        space_tokenizer = SpaceTokenizer()
        results = space_tokenizer.tokenize(results)
        results = list(filter(None, results))
        results = ' '.join([w for w in results])

    # -----------------------------------------------
    # Lemmatization using NLTK
    if results != '':
        lemmatizer_of_text = WordNetLemmatizer()
        word_list = word_tokenize(results)
        results = ' '.join([
            lemmatizer_of_text.lemmatize(w, get_word_net_pos_tag(w))
            for w in word_list
        ])

    # ---------------------------------------------
    # Stemming using NLTK
    if results != '':
        stemmer = PorterStemmer()
        if type(results) == list:
            results = ' '.join(str(w) for w in results)
        results = word_tokenize(str(results))
        results = [stemmer.stem(word) for word in results]
        results = ' '.join(str(w) for w in results)

    # ---------------------------------------------
    # Remove Stop Words
    if stop_word and results != '':
        nltk.download('stopwords')
        stop_words = set(stopwords.words(stop_word_language))
        word_tokens = word_tokenize(results)
        results = ' '.join(str(w) for w in word_tokens if w not in stop_words)

    # ---------------------------------------------
    # Chunking of the input, will be used for coloring of the text
    if results != '':
        result_str = TextBlob(results)
        reg_exp = 'NP: {<DT>?<JJ>*<NN>}'
        rp = nltk.RegexpParser(reg_exp)
        results_chunk = rp.parse(result_str.tags)
    # results_chunk.draw()

    # ---------------------------------------------
    # Named Entity Recognition
    if results != '':
        results_named_entity = ne_chunk(pos_tag(word_tokenize(results)))

    return results, results_chunk, results_named_entity
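An illustrative call (added, not from the source). It assumes the original module's imports (re, string, nltk, the tokenizers, WordNetLemmatizer, PorterStemmer, word_tokenize, stopwords, TextBlob, pos_tag, ne_chunk) and its helper get_word_net_pos_tag() are in scope, and that the 'punkt', 'wordnet', 'stopwords', tagger and NE-chunker resources have been downloaded.

cleaned, chunks, named_entities = text_pre_processing(
    "@user Check https://example.com  I didn't like the 2 new offices in London!")
print(cleaned)            # lowercased, negation-expanded, stemmed text with stop words removed
print(named_entities)     # nltk Tree of NE chunks over the cleaned text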
print(tokenizer.tokenize(text))

print(regexp_tokenize(text, pattern=r'\w+|\$[\d\.]+|\S+'))

# Tokenize on whitespace
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
print(tokenizer.tokenize(text))

# Select only words starting with capital letters
capt = RegexpTokenizer(r'[A-Z]\w+')
print(capt.tokenize(text2))

print(BlanklineTokenizer().tokenize(text2))

print(WhitespaceTokenizer().tokenize(text2))

print(LineTokenizer(blanklines='keep').tokenize(text2))
print(LineTokenizer(blanklines='discard').tokenize(text2))

# SpaceTokenizer works similar to .split(' ')
print(SpaceTokenizer().tokenize(text2))

# Returns the sequence of tuples that are offsets of the tokens
# in a sentence:
print(list(WhitespaceTokenizer().span_tokenize(text2)))

# Returns the sequence of relative spans
print(list(spans_to_relative(WhitespaceTokenizer().span_tokenize(text2))))

# Returns the offsets of tokens in text2 by splitting at each incidence of the separator:
print(list(string_span_tokenize(text2, " ")))
Example #21
import nltk
sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
from nltk.tokenize import SpaceTokenizer
print(SpaceTokenizer().tokenize(sent))
Example #22
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
import nltk

d = nltk.data.find('corpora/cookbook')
reader = TaggedCorpusReader(d, r'.*\.pos')
print(reader.words())
print(reader.tagged_words())
print(reader.sents())
print(reader.tagged_sents())
print(reader.paras())
print(reader.tagged_paras())

# custom tokenizer
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print(reader.sents())
print(reader.tagged_sents())

# universal tagset
reader = TaggedCorpusReader(d,
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.tagged_sents(tagset='universal'))

# NLTK tagged corpora
from nltk.corpus import treebank
print(treebank.tagged_words())
print(treebank.tagged_words(tagset='universal'))
Example #23
    def get_train_data(self, MAX_LENGTH=60, full_neg=True):
        ## MAX_LENGTH: max length of segments to be split into
        ## neg ratio: how many neg data to use (out of 100), should be an integer
        ## full_neg: whether to extract all neg data

        max_length_token = MAX_LENGTH

        ## avoid taking docs from test set
        test_doc_ids = []
        zero_shot_doc_ids = []
        with open('../data/all_test_docs/test_doc_ids') as f:
            fl = f.readlines()
            test_doc_ids = [int(line.strip()) for line in fl]

        with open('../data/all_test_docs/zero_shot_doc_ids') as f:
            fl = f.readlines()
            zero_shot_doc_ids = [int(line.strip()) for line in fl]

        train_doc_len = len(
            set(self.data_set_citations['publication_id'].values)) - len(
                test_doc_ids) - len(zero_shot_doc_ids)
        logger.info('sample from ' + str(train_doc_len) + ' train docs')

        pos_count = 0
        neg_count = 0
        pos_tokens = 0
        neg_tokens = 0

        sampled = []
        with codecs.open(self.outdir + 'pos_data',
                         'w') as pos_data, codecs.open(
                             self.outdir + 'neg_data', 'w') as neg_data:
            for index, row in self.data_set_citations.iterrows():
                pub_id = row['publication_id']
                if pub_id in zero_shot_doc_ids or pub_id in test_doc_ids:
                    continue

                if pub_id in sampled:
                    continue
                else:
                    sampled.append(pub_id)

                pub_ids = list(self.data_set_citations['publication_id'])
                rows = [pub_ids.index(i) for i in pub_ids if i == pub_id]
                mention_list = []
                for r in rows:
                    d_row = self.data_set_citations.loc[r]
                    mention_list.extend(d_row['mention_list'])
                mention_list = set(mention_list)
                logger.info('pub id: {}, mentions: {}'.format(
                    pub_id, len(mention_list)))

                sample_text = self.full_text[str(pub_id) + '.txt']
                sample_text_tokens = list(
                    SpaceTokenizer().tokenize(sample_text))
                sample_text_spans = list(
                    SpaceTokenizer().span_tokenize(sample_text))

                pos_splits = []
                for mention_text in mention_list:
                    mention_text = re.sub('\d', ' ', mention_text)
                    # mention_text = re.sub('[^ ]- ', '', mention_text)
                    mention_text_spans = list(
                        SpaceTokenizer().span_tokenize(mention_text))

                    index_finder_lower = findall_lower(mention_text,
                                                       sample_text)

                    all_found_indices = [idx for idx in index_finder_lower]

                    for find_index in all_found_indices:
                        try:
                            if find_index != -1:
                                # logger.info('Found: '+mention_text)
                                new_mention_text_spans = [
                                    (indices[0] + find_index,
                                     indices[1] + find_index)
                                    for indices in mention_text_spans
                                ]
                                #write to training sample pointers here

                                for splits in range(
                                        len(sample_text_tokens) //
                                        max_length_token - 1):
                                    if sample_text_spans.index(new_mention_text_spans[0]) > splits*(max_length_token) and \
                                      sample_text_spans.index(new_mention_text_spans[-1]) < (splits+1)*(max_length_token):

                                        pos_splits.append(splits)
                                        pos_count += 1
                                        pos_tokens += len(
                                            new_mention_text_spans)
                                        neg_tokens += (
                                            MAX_LENGTH -
                                            len(new_mention_text_spans))

                                        #TODO Wrapper over full data reader
                                        pos_data.write(
                                            str(
                                                sample_text_spans.index(
                                                    new_mention_text_spans[0])
                                                - splits *
                                                (max_length_token)) + ' ' +
                                            str(
                                                sample_text_spans.index(
                                                    new_mention_text_spans[-1])
                                                - splits *
                                                (max_length_token)) + ' ' +
                                            str(row['data_set_id']) + ' ' +
                                            str(row['publication_id']) + ' ' +
                                            ' '.join(sample_text_tokens[
                                                splits * (max_length_token):
                                                (splits + 1) *
                                                (max_length_token) + 1]) +
                                            '\n')
                            else:
                                # print ('Annotation Error: Annotated gold standards not correct')
                                pass
                        except ValueError:
                            # print ('Indexing Logic Error: Some corner index case missed while parsing')
                            pass

                if not full_neg:
                    continue
                ## NOTE: index starts from 0
                ## -1 - 1 means no mention
                for splits in range(
                        len(sample_text_tokens) // (max_length_token) - 1):
                    if splits not in pos_splits:
                        neg_data.write(
                            str(-1) + ' ' + str(-1) + ' ' + str(0) + ' ' +
                            str(row['publication_id']) + ' ' +
                            ' '.join(sample_text_tokens[splits *
                                                        (max_length_token):
                                                        (splits + 1) *
                                                        (max_length_token)]) +
                            '\n')

                        neg_count += 1
                        neg_tokens += MAX_LENGTH

        logger.info(str(pos_count) + " mentions added.")
        logger.info(str(neg_count) + " no mentions added.")
        logger.info(str(pos_tokens) + " pos tokens added.")
        logger.info(str(neg_tokens) + " neg tokens added.")
        logger.info("neg token percentage: {}".format(
            neg_tokens * 100 / (pos_tokens + neg_tokens)))
Example #24
#print(question[1])
selected_k = []
for r in range(len(ranked_q)):
    pos = nltk.pos_tag(ranked_q[r])
    selective_pos = ['NN', 'VB']
    selective_pos_words = []
    for word, tag in pos:
        if tag in selective_pos:
            selective_pos_words.append((word, tag))
    selected_k.append(selective_pos_words)
#print(selected_k[1][0])

# In[14]:

from nltk.tokenize import SpaceTokenizer
tm = SpaceTokenizer()
to_rank = []
key_words = []

for i in range(len(ranked_q)):
    yn = 0

    #ranked_q[i][yn]
    question[i] = untokenize(question[i])

    yy = "_____"
    to_rank.append(tm.tokenize(ranked_q[i][0]))
    print("Q:", question[i].replace(to_rank[i][len(to_rank[i]) // 2], yy))
    print('Ans - ', to_rank[i][len(to_rank[i]) // 2])
    #quita = question[i].index(to_rank[i][len(to_rank[i])//2])
Example #25
# -*- coding: utf-8 -*-

from nltk.tokenize import SpaceTokenizer

import sys

s = sys.argv[1]
dt = sys.argv[2]

#print("dt = " + dt)

#python telugu_tokenizer.py "భారతదేశపు దక్షిణ \tసముద్ర \nప్రాంతంలో అది \n ‘పార్సన్స్ పిగ్మాలియన్’ ప్రదేశ"   ' 	'
tokens = []

if dt == " ":
    tokens = SpaceTokenizer().tokenize(s)

#python telugu_tokenizer.py "భారతదేశపు దక్షిణ \tసముద్ర \nప్రాంతంలో అది \t ‘పార్సన్స్ పిగ్మాలియన్’ ప్రదేశ"   '\t'
from nltk.tokenize import TabTokenizer


if dt == '\\t':
    print("dt = " + dt)
    s = s.replace('\\t', '\t')
    tokens = TabTokenizer().tokenize(s)

#python telugu_tokenizer.py "భారతదేశపు దక్షిణ \tసముద్ర \nప్రాంతంలో అది \n ‘పార్సన్స్ పిగ్మాలియన్’ ప్రదేశ"   '\n'
from nltk.tokenize import LineTokenizer

if dt == '\\n':
    s = s.replace('\\n', '\n')
        X_out.append(getEmbeddingOfSequence(args, seq))
    return X_out


def getEmbeddingOfSequence(args, sequence):
    # Avoid shadowing the built-in name `list`
    embeddings = []
    for i in sequence:
        if i in args.index2word:
            embeddings.append(args.word2vec[args.index2word[i]])
        else:
            embeddings.append(args.word2vec['<UK>'])
    #print(embeddings)
    return embeddings


stringTokenizer = SpaceTokenizer()
'''Other features over relationphrase
'''


def get_other_relationphrase_features(args, x_1, relationphrase_complete,
                                      relationphrase_middle, entity_1,
                                      entity_2):

    if len(relationphrase_complete) < 4:
        return x_1

    #nltk.word_tokenize(sentence)
    relationphrase_complete_tokens = stringTokenizer.tokenize(
        relationphrase_complete)
    relationphrase_complete_postags = nltk.pos_tag(