Example #1
def fun_1_1_5():
    import nltk
    from nltk.tokenize import RegexpTokenizer
    from nltk.tokenize import regexp_tokenize
    tokenizer = RegexpTokenizer("[\w]+")
    print "RegexpTokenizer:", tokenizer.tokenize(
        "Don't hesitate to ask questions")
    print "regexp_tokenizer:", regexp_tokenize(
        "Don't hesitate to ask questions", pattern="\w+|\$[\d\.]+|\S+")
    # Tokenize on whitespace (with gaps=True the pattern matches the separators)
    tokenizer = RegexpTokenizer('\s+', gaps=True)
    print "RegexpTokenizer:", tokenizer.tokenize(
        "Don't hesitate to ask questions")
    # Select the words that start with a capital letter
    sent = " She secured 90.56 % in class X \n. She is a meritorious student"
    capt = RegexpTokenizer('[A-Z]\w+')
    print "RegexpTokenizer:", capt.tokenize(sent)
    # BlanklineTokenizer, a subclass of RegexpTokenizer, shows how a predefined regular expression is used
    from nltk.tokenize import BlanklineTokenizer
    print "BlanklineTokenizer:", BlanklineTokenizer().tokenize(sent)
    # Strings can also be split on spaces, tabs, newlines and so on
    from nltk.tokenize import WhitespaceTokenizer
    print "WhitespaceTokenizer:", WhitespaceTokenizer().tokenize(sent)
    # WordPunctTokenizer uses the regular expression \w+|[^\w\s]+ to split the text
    # into alphabetic and non-alphabetic characters
    from nltk.tokenize import WordPunctTokenizer
    print "WordPunctTokenizer:", WordPunctTokenizer().tokenize(sent)
    # Tokenize with the built-in split() method
    print "split():", sent.split()
    print "split(' '):", sent.split(' ')
    print "split('\n'):", sent.split('\n')
    # Like sent.split('\n'), LineTokenizer splits the text into lines
    from nltk.tokenize import LineTokenizer
    print "LineTokenizer:", LineTokenizer().tokenize(sent)
    print "LineTokenizer:", LineTokenizer(blanklines='keep').tokenize(sent)
    print "LineTokenizer:", LineTokenizer(blanklines='discard').tokenize(sent)
    # SpaceTokenizer works like sent.split(' ')
    from nltk.tokenize import SpaceTokenizer
    print "SpaceTokenizer:", SpaceTokenizer().tokenize(sent)
    # Span tokenization returns a sequence of tuples giving the position and offset
    # of each token in the sentence (helper functions live in nltk.tokenize.util)
    print "Token spans:", list(WhitespaceTokenizer().span_tokenize(sent))
    # Given a sequence of spans, spans_to_relative() returns spans relative to the previous token
    from nltk.tokenize.util import spans_to_relative
    print "位置和偏移:", list(
        spans_to_relative(WhitespaceTokenizer().span_tokenize(sent)))
    # string_span_tokenize(sent, separator) from nltk.tokenize.util splits at each occurrence of the separator and returns the offsets of the tokens in sent:
    from nltk.tokenize.util import string_span_tokenize
    print "标识符序列:", list(string_span_tokenize(sent, " "))
Example #2
def line_tokenizer(text):
    """ split the sentence into lines.

    :param text: sentence
    :return: list of lines
    """
    lines = LineTokenizer(blanklines='discard').tokenize(text)
    return lines
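A minimal usage sketch for this helper (the import is required by the function above; the sample string is made up):

from nltk.tokenize import LineTokenizer

print(line_tokenizer("first line\n\nsecond line"))
# blanklines='discard' drops the empty line: ['first line', 'second line']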
Example #3
    def __init__(self, name, text, bow, region, gen_region=False):
        self.name = name
        self.text = text
        self.bow = bow
        self.region = region

        if gen_region:
            lines = LineTokenizer().tokenize(self.text)
            self.region = lines[0].split('EN ')[-1]
 def sentOnDate(self, day, month, year):
     sp = []
     y = "/"
     name_pattern = str(month) + y + str(day) + y + str(year)
     f = open(self.myfile, encoding="utf8")
     raw = f.read()
     sentences = LineTokenizer(blanklines='keep').tokenize(raw)
     for i in sentences:
         if (re.search(name_pattern, i)):
             print(i)
 def visWithoutDate(self, number):
     sp = []
     x = input('enter friend name')
     y = input('enter your name')
     name_pattern = "\d*\/\d*\/\d*, \d*:\d*\s-\s(" + x + ": |" + y + ": )|<Media omitted>|\.|\-"
     f = open(self.myfile, encoding="utf8")
     raw = f.read()
     sentences = LineTokenizer(blanklines='keep').tokenize(raw)
     for i in sentences:
         sp.append(re.sub(name_pattern, '', i))
     print(sp)
     myplot = nltk.FreqDist(sp)
     myplot.plot(number)
def get_sentences(article):
    """
    In:
        article = Cleaned wikipedia article text

    Out:
        sentences = List of sentences in wikipedia article
    """
    lines = LineTokenizer(blanklines='discard').tokenize(
        article.replace("<br>", "\n"))
    sentences_by_lines = [sent_tokenize(line) for line in lines]
    sentences = [sentence for line in sentences_by_lines for sentence in line]

    return sentences
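A quick usage sketch for get_sentences (the article text and <br> markers are made up; the imports are needed by the function above, and sent_tokenize assumes the NLTK punkt model is installed):

from nltk.tokenize import LineTokenizer, sent_tokenize

article = "First sentence. Second sentence.<br>Third sentence."
print(get_sentences(article))
# ['First sentence.', 'Second sentence.', 'Third sentence.']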
 def take_ngrams_by_topic_from_file(self, ngram_directory, ngram_file):
     corpus = \
         TaggedCorpusReader(ngram_directory,
                            ngram_file,
                            sent_tokenizer=LineTokenizer(blanklines='discard'),
                            encoding='utf-8')
     corpus_paras = corpus.paras()[:]
     k = corpus_paras[::2]
     for i in range(2):
         k = list(chain(*k))
     v = corpus_paras[1::2]
     ngrams_by_topic_from_file = \
         {k.encode('utf-8'): list(set(chain(*v)))
            for k, v in dict(izip(k, v)).items()}
     return ngrams_by_topic_from_file
Example #8
def word_tokenizer(text):
    """Split the sentence into lines.

    <S> <S>+BSTag
    2007-5-2Dişlerimiz 2007-5-2Dişlerimiz[Unknown]
    Arasındaki ara[Adj]-[Noun]+[A3sg]+SH[P3sg]+NDA[Loc]-ki[Adj+Rel] : 22.3828125 Aras[Noun]+[Prop]+[A3sg]+
        SH[P3sg]+NDA[Loc]-ki[Adj+Rel] : 19.3212890625 Aras[Noun]+[Prop]+[A3sg]+Hn[P2sg]+
        NDA[Loc]-ki[Adj+Rel] : 23.994140625 ara[Noun]+[A3sg]+SH[P3sg]+NDA[Loc]-ki[Adj+Rel] : 11.2919921875
    Ceset ceset[Noun]+[A3sg]+[Pnon]+[Nom] : 10.6982421875
    </S> </S>+ESTag

    :param text: sentence
    :return: list of lines
    """
    lines = LineTokenizer(blanklines='discard').tokenize(text)
    return lines
 def visualOnDay(self, day, month, year, number):
     sp = []
     y = "/"
     sp2 = []
     p = input('enter friend name')
     k = input('enter your name')
     date_pattern = str(month) + y + str(day) + y + str(year)
     f = open(self.myfile, encoding="utf8")
     raw = f.read()
     sentences = LineTokenizer(blanklines='keep').tokenize(raw)
     for i in sentences:
         if (re.search(date_pattern, i)):
             sp.append(i)
     pattern2 = "\d*\/\d*\/\d*, \d*:\d*\s-\s(" + p + ": |" + k + ": )|<Media omitted>|\.|\-|\s"
     for i in sp:
         sp2.append(re.sub(pattern2, '', i))
     myplot = nltk.FreqDist(sp2)
     myplot.plot(number)
 def generate_corpus_from_segmented_reports(self):
     re = ReportEnviroments()
     new_corpus_of_segmented_reports = TaggedCorpusReader(
         re.segmented_reports_corpus_path,
         '.*',
         sent_tokenizer=LineTokenizer(blanklines='discard'),
         encoding='utf-8')
     raw_segmented_reports = []
     for i in range(len(new_corpus_of_segmented_reports.fileids())):
         raw_segmented_reports.append(
             new_corpus_of_segmented_reports.sents(
                 fileids=new_corpus_of_segmented_reports.fileids()[i]))
     cut_of_segmented_reports = []
     topics = ['DISCENTE', 'DOCENTE', 'INFRAESTRUTURA', 'UNCATEGORIZED']
     for i in range(len(raw_segmented_reports)):
         cut_of_segmented_reports.append(
             raw_segmented_reports[i]
             [raw_segmented_reports[i].index([topics[0].decode('utf-8')]):
              raw_segmented_reports[i].index([topics[-1].decode('utf-8')]) +
              1])
     return cut_of_segmented_reports, topics
Example #11
class DataFilter:
    def __init__(self):
        self.bl_tokenizer = LineTokenizer()
        self.re_tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        self.stemmer = SnowballStemmer('english')
        self.NGRAM_RANGE = 3

    def rm_blanklines(self, text):
        return " ".join([word for word in self.bl_tokenizer.tokenize(text)])

    def rm_stopwords(self, text):
        return " ".join([
            word for word in word_tokenize(text)
            if word.lower() not in stopwords.words()
        ])

    def ngram_tokenize(self, text):
        return [
            word for sent in sent_tokenize(text) for word in ngrams(
                self.re_tokenizer.tokenize(sent), self.NGRAM_RANGE)
        ]

    def tokenize_(self, text):
        return [
            word for sent in sent_tokenize(text)
            for word in self.re_tokenizer.tokenize(sent)
        ]

    def tokenize_and_stem(self, text):
        return [
            self.stemmer.stem(word) for sent in sent_tokenize(text)
            for word in self.re_tokenizer.tokenize(sent)
        ]

    def rm_nonwords(self, text):
        return " ".join([
            word for word in word_tokenize(text)
            if word.lower() in words.words()
        ])
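The ngram_tokenize method above relies on nltk.util.ngrams; a standalone sketch of that call (sample sentence made up) may make the output shape clearer:

from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams

tok = RegexpTokenizer(r'[a-zA-Z]+')
print(list(ngrams(tok.tokenize("the quick brown fox jumps"), 3)))
# [('the', 'quick', 'brown'), ('quick', 'brown', 'fox'), ('brown', 'fox', 'jumps')]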
Example #12
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

lTokenizer = LineTokenizer()
print(
    "Line tokenizer output :",
    lTokenizer.tokenize(
        "My name is Maximus Decimus Meridius, commander of the Armies of the North, General of the Felix Legions and loyal servant to the true emperor, Marcus Aurelius. \nFather to a murdered son, husband to a murdered wife. \nAnd I will have my vengeance, in this life or the next."
    ))

rawText = "By 11 o'clock on Sunday, the doctor shall open the dispensary."
sTokenizer = SpaceTokenizer()
print("Space Tokenizer output :", sTokenizer.tokenize(rawText))

print("Word Tokenizer output :", word_tokenize(rawText))

tTokenizer = TweetTokenizer()
print("Tweet Tokenizer output :",
      tTokenizer.tokenize("This is a cooool #dummysmiley: :-) :-P <3"))
Example #13
# <codecell>

import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import LineTokenizer
from nltk.corpus import state_union
fw = open("064.txt")
text = remove_non_ascii_2(fw.read())

import nltk.data
sents = LineTokenizer(blanklines=u'discard').tokenize(text.strip())
for i in sents:
    words = nltk.word_tokenize(i)
    pos = nltk.pos_tag(words)
    sentt = nltk.ne_chunk(pos, binary = False)
    person_list = []
    person = []
    name = ""
    for subtree in sentt.subtrees(filter=lambda t: t.label() == 'PERSON'):
        for leaf in subtree.leaves():
            person.append(leaf[0])
        if len(person) > 1: #avoid grabbing isolated surnames
            for part in person:
                name += part + ' '
            if name[:-1] not in person_list:
                person_list.append(name[:-1])
# In[6]:

# LineTokenizer
import nltk
from nltk.tokenize import LineTokenizer, word_tokenize  # word_tokenize is used below

# LineTokenizer can be used to split strings containing newline characters

# In[7]:

s = "I love kites.\nI like cricket.\nI like football.\n"

print("Sentences: ")
print(s)
print("LineTokenizer...")
print(LineTokenizer().tokenize(s))
print("\nword_tokenizer... ")
for sent in LineTokenizer().tokenize(s):
    print(word_tokenize(sent))

# In[8]:

from nltk.tokenize import RegexpTokenizer

# RegexpTokenizer lets us supply a regular expression that matches the tokens
# (or, with gaps=True, the delimiters); the material between the tokens is discarded.

# In[9]:

s = "Petrol price has gone upto Rs.75.89 on 01/02/2017. John and Mrs. Thomas are thinking of using electric scooters."
tokenizer = RegexpTokenizer(r'Rs\.[\d]+\.[\d]+')
print(tokenizer.tokenize(s))  # -> ['Rs.75.89']
def line_tokenizer(data, blanklines):
    ''' Tokenize the text at line level, i.e. split on \n '''
    tokenizer = LineTokenizer(blanklines=blanklines)
    return tokenizer.tokenize(data)
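A short sketch contrasting the two most common blanklines settings accepted by this helper (sample text made up; LineTokenizer also accepts 'discard-eof'):

from nltk.tokenize import LineTokenizer

data = "line one\n\nline two"
print(line_tokenizer(data, 'keep'))     # ['line one', '', 'line two']
print(line_tokenizer(data, 'discard'))  # ['line one', 'line two']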
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time       : 2020/7/11 17:37
# @Author     : 代登辉
# @Email      : [email protected]
# @File       : tokenizer.py
# @Software   : PyCharm
# @Description: Tokenization

# Import the required libraries
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

text = "My name is Maximus Decimus Meridius, commander of the Armies of the North, General of the Felix Legions and " \
       "loyal servant to the true emperor, Marcus Aurelius. \nFather to a murdered son, husband to a murdered wife. " \
       "\nAnd I will have my vengeance, in this life or the next. "
ITokenizer = LineTokenizer()
print("按照换行分词 ", ITokenizer.tokenize(text))

rawText = "By 11 o'clock on Sunday, the doctor shall open the dispensary."
sTokenizer = SpaceTokenizer()
print("按照空格符分词 :", sTokenizer.tokenize(rawText))  # 表达符号和单词连在一起
print("按照单词分词 :", word_tokenize(rawText))  # 表达符号和单词分开

tweet = "This is a cooool #dummysmiley: :-) :-P <3"
tTokenizer = TweetTokenizer()
print("处理特殊字符 ", tTokenizer.tokenize(tweet))
Example #17
def sentence_tokenizer(corpus):
    line_tokenizer = LineTokenizer()
    song_lines = line_tokenizer.tokenize(corpus)
    return song_lines
Example #18
para = "Hola amigos. Gracias por ver este video. Saludos"       # Defines the text to tokenize
tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')   # Loads the spanish sentence tokenizer
print (tokenizer.tokenize(para))                                # Tokenizes the text

# Tokenize based on lines, spaces or tweets (special class)
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

# Line tokenizer
longSentence = 'My name is Maximus Decimus Meridius, Commander of the Armies '\
'of the North, General of the Felix Legions, loyal servant to '\
'the true emperor, Marcus Aurelius. Father to a murdered son, '\
'husband to a murdered wife. And I will have my vengeance, in '\
'this life or the next.'

lTokenizer = LineTokenizer()
sentenceTokens = lTokenizer.tokenize(longSentence)
print (sentenceTokens)

# Space tokenizer
sTokenizer = SpaceTokenizer()
spaceTokens = sTokenizer.tokenize(longSentence)
print (spaceTokens)

# Tweet tokenizer
tweet = 'This is a coool #dummysmiley: :-) :) :-P <3'
tTokenizer = TweetTokenizer()
tTokens = tTokenizer.tokenize(tweet)
print ('Tweet tokenizer output:')
print (tTokens)
                unicode_list = (str(y) + '|' + str(m) +
                                '|') + u'**'.join(myList)
                clean_unicode = '\n' + unicode_list.replace(
                    '**', '\n' + str(y) + '|' + str(m) + '|')
                #print clean_unicode
                text.write(clean_unicode.encode('utf-8'))
            except KeyError:
                pass
        time.sleep(1)
text.close()
print 'finished getting keywords'

text = open('keywords' + daterange + '.txt', 'r').read()
freqlist = open('freqlist' + daterange + '.txt', 'wb')
lowertext = text.lower()
lines = LineTokenizer(blanklines='discard').tokenize(lowertext)
freq = nltk.FreqDist(lines)
#print freq
writer = csv.writer(freqlist,
                    delimiter='|',
                    lineterminator='\n',
                    quotechar='"',
                    quoting=csv.QUOTE_MINIMAL)
writer.writerows(freq.items())
freqlist.close()
print 'finished getting frequency distribution'

fileout = codecs.open('frequency' + daterange + '.csv', 'wb')

csv_out = csv.writer(fileout, lineterminator='\n', delimiter=',')
wr = UnicodeWriter(fileout,
 def printSentences(self):
     f = open(self.myfile, encoding="utf8")
     raw = f.read()
     sentences = LineTokenizer(blanklines='keep').tokenize(raw)
     print(sentences)
def lang_detection():
    danish_sent = []
    dutch_sent = []
    english_sent = []
    finnish_sent = []
    french_sent = []
    german_sent = []
    hungarian_sent = []
    italian_sent = []
    norwegian_sent = []
    portuguese_sent = []
    russian_sent = []
    spanish_sent = []
    swedish_sent = []
    turkish_sent = []

    print "Welcome! Please enter the names of the text file you'd like to analyze in the format [filename].txt"
    text1 = raw_input("Filename: ")
    with open(text1, 'r') as file:
        text = file.read()
        sent_text = LineTokenizer(blanklines='discard').tokenize(
            text)  # Split the text by lines
        # Now loop over each line and tokenize it separately
        for sentence in sent_text:
            tokenized_text = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(tokenized_text)

            languages_ratios = {}

            tokens = wordpunct_tokenize(sentence)
            words = [word.lower() for word in tokens]

            for language in stopwords.fileids():
                stopwords_set = set(stopwords.words(language))
                words_set = set(words)
                common_elements = words_set.intersection(stopwords_set)

                languages_ratios[language] = len(
                    common_elements)  # language "score"

                ratios = languages_ratios

                most_rated_language = max(ratios, key=ratios.get)

                language = most_rated_language

            if language == "danish":
                danish_sent.append(sentence)
                print language

            elif language == "dutch":
                dutch_sent.append(sentence)
                print language

            elif language == "english":
                english_sent.append(sentence)
                print language

            elif language == "finnish":
                finnish_sent.append(sentence)
                print language

            elif language == "french":
                french_sent.append(sentence)
                print language

            elif language == "german":
                german_sent.append(sentence)
                print language

            elif language == "hungarian":
                hungarian_sent.append(sentence)
                print language

            elif language == "italian":
                italian_sent.append(sentence)
                print language

            elif language == "norwegian":
                norwegian_sent.append(sentence)
                print language

            elif language == "portuguese":
                portuguese_sent.append(sentence)
                print language

            elif language == "russian":
                russian_sent.append(sentence)
                print language

            elif language == "spanish":
                spanish_sent.append(sentence)
                print language

            elif language == "swedish":
                swedish_sent.append(sentence)
                print language

            elif language == "turkish":
                turkish_sent.append(sentence)
                print language

        if danish_sent:
            dan_file = open("danish.txt", "w")
            for item in danish_sent:
                dan_file.write(str(item) + '\n')

        if dutch_sent:
            dut_file = open("dutch.txt", "w")
            for item in dutch_sent:
                dut_file.write(str(item) + '\n')

        if english_sent:
            eng_file = open("english.txt", "w")
            for item in english_sent:
                eng_file.write(str(item) + '\n')

        if finnish_sent:
            fin_file = open("finnish.txt", "w")
            for item in finnish_sent:
                fin_file.write(str(item) + '\n')

        if french_sent:
            fra_file = open("french.txt", "w")
            for item in french_sent:
                fra_file.write(str(item) + '\n')

        if german_sent:
            ger_file = open("german.txt", "w")
            for item in german_sent:
                ger_file.write(str(item) + '\n')

        if hungarian_sent:
            hun_file = open("hungarian.txt", "w")
            for item in hungarian_sent:
                hun_file.write(str(item) + '\n')

        if italian_sent:
            ita_file = open("italian.txt", "w")
            for item in italian_sent:
                ita_file.write(str(item) + '\n')

        if norwegian_sent:
            nor_file = open("norwegian.txt", "w")
            for item in norwegian_sent:
                nor_file.write(str(item) + '\n')

        if portuguese_sent:
            por_file = open("portuguese.txt", "w")
            for item in portuguese_sent:
                por_file.write(str(item) + '\n')

        if russian_sent:
            rus_file = open("russian.txt", "w")
            for item in russian_sent:
                rus_file.write(str(item) + '\n')

        if spanish_sent:
            spa_file = open("spanish.txt", "w")
            for item in spanish_sent:
                spa_file.write(str(item) + '\n')

        if swedish_sent:
            swe_file = open("swedish.txt", "w")
            for item in swedish_sent:
                swe_file.write(str(item) + '\n')

        if turkish_sent:
            tur_file = open("turkish.txt", "w")
            for item in turkish_sent:
                tur_file.write(str(item) + '\n')

        start_over = raw_input(
            "Would you like to run the program again? Type 'y' for yes and 'n' for no."
        )
        if start_over == "y":
            lang_detection()
        if start_over == "n":
            print "Bye!"
            exit(0)
        else:
            print("Please enter a valid input.")
def load_stop_words():
    STOP_WORDS_PATH = join('sw.txt')
    content = read_file(STOP_WORDS_PATH)
    lt = LineTokenizer()
    return lt.tokenize(content)
Example #23
import os
from nltk.tokenize import LineTokenizer
import pandas as pd
#import json


# In[2]:


myPath="./docs/"


# In[3]:


ltokenizer=LineTokenizer()


# In[4]:


def get_rtf_names(loc_path):
    rtf_names=[]
    names=os.listdir(path=loc_path)
    for name in names:
        if ".rtf" in name:
            rtf_names.append(loc_path+name)
    return rtf_names


# In[ ]:
Example #24
import pickle
import sys
from nltk.tokenize import LineTokenizer
file_name = sys.argv[1]
filename_and_extension = file_name.split('.')
without_extension = filename_and_extension[0]+"."+filename_and_extension[1]+"_tagged."+filename_and_extension[2]
count = 1
data = open(file_name, 'r').read()
list_of_data = LineTokenizer(blanklines='keep').tokenize(data)
sample_sentences = list_of_data[:]
tagged_corpus = open(without_extension, "a+")
tagger = pickle.load(open('/home/tsegaye/PycharmProjects/POS 2018/models/tnt_back_off.pkl', 'rb'))
for each_item in sample_sentences:
    tagged_sentence= tagger.tag(each_item.split())
    tagged_corpus.write(' '.join('{}|{}|{}'.format(x[0],x[0], x[1]) for x in tagged_sentence))
    tagged_corpus.write('\n')
    print(count)
    count += 1
print('tagging words finished')
tagged_corpus.close()
import json
import os
import time
from nltk.corpus import stopwords
from nltk.tokenize import LineTokenizer, RegexpTokenizer
from pyspark import SparkContext, SparkConf
from collections import OrderedDict
from constants import *
from wordProcessing import *

start_time = time.time()
stopwords_list = stopwords.words('english')
line_tokenizer = LineTokenizer()
regex_tokenizer = RegexpTokenizer("[\w']+")

def extract(article):
	if article is None or article == '':
		print('I got a null or empty string value for article in a file')
		return {}
	obj = json.loads(article)
	article_id = int(obj['id'])
	text = obj['text']
	
	text = replace_punctuations(text)
	text = strip_accents(text)

	lines = line_tokenizer.tokenize(text)
	word_map = {}
	for line in lines:
		#print(line)
		words = regex_tokenizer.tokenize(line)
Example #26
import random
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords
from nltk.tokenize import LineTokenizer, RegexpTokenizer
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from nltk.stem import SnowballStemmer
from processor import Processor as Proc

data_folder = './data'
encoding = 'UTF8'
language = 'italian'

wordTok = RegexpTokenizer(
    r'(\w+|@\w+|<3|(\:\-?\))|(\:\-?\()|(\;\-?\))|((\:|(X|x))\-?(D|d)))')
sentTok = LineTokenizer()
reader = CategorizedPlaintextCorpusReader(data_folder,
                                          r'SENTIPOLC-.*\.txt',
                                          cat_pattern=r'SENTIPOLC-(\w+)\.txt',
                                          encoding=encoding,
                                          word_tokenizer=wordTok,
                                          sent_tokenizer=sentTok)

pos_tweets = reader.sents(reader.fileids('pos'))
neg_tweets = reader.sents(reader.fileids('neg'))

# Inspection
rndP = random.randrange(len(pos_tweets))
rndN = random.randrange(len(neg_tweets))
print 'Pos:\n', pos_tweets[rndP:rndP + 3], '\nNeg:\n', neg_tweets[rndN:rndN + 3], '\n'
    def extract_data(self,
                     filepath,
                     ind_features=_PARAIND_FEAT,
                     dep_features=_PARADEP_FEAT,
                     labels_per_sent=None,
                     labels_per_window=None):
        """Extract features, reduce dimensions with a PCA and return data.

        Exports raw- and PCA-reduced data both in arff- and numpy-format.
        """
        start = time.clock()
        self.dictVectorizer = DictVectorizer(sparse=False)
        filename = os.path.split(filepath)[1]
        directory = os.path.split(filepath)[0]
        plain_reader = PlaintextCorpusReader(
            directory, [filename],
            word_tokenizer=RegexpTokenizer("(-?\d+\.\d+)|[\w']+|[" +
                                           string.punctuation + "]"),
            sent_tokenizer=LineTokenizer(blanklines="discard"),
            encoding='utf8')

        # create new subdir for extracted data
        if _NEW_SUBDIR is not None:
            path = os.path.join(directory, _NEW_SUBDIR)
            if not os.path.exists(path):
                os.makedirs(path)
            path = os.path.join(path, os.path.splitext(filename)[0])
            # print "path {}".format(path)
        else:
            path = os.path.splitext(filepath)[0]
            # print "path {}".format(path)

        # filepaths for weka- and numpy-files
        arff_filepath = path + ".arff"
        arff_filepath_pca = path + "_pca95.arff"
        numpy_filepath = path + ".npy"
        numpy_filepath_pca = path + "_pca95.npy"

        # print(":time: Reader created, time elapsed {}").format(time.clock() - start)
        paras = plain_reader.paras()
        # print(":time: Paras created, time elapsed {}").format(time.clock() - start)
        sents = plain_reader.sents()
        # print(":time: Sents created, time elapsed {}").format(time.clock() - start)

        # get paragraph boundaries for sliding-window
        self.boundaries = util.get_boundaries(paras)
        boundaries_backup = self.boundaries

        # check if all files necessary exist, if yes - unpickle/load them and return data
        if util.files_already_exist([
                numpy_filepath_pca,
        ]):
            print "Features already extracted. Calculating clusters...\n"
            matrix_sklearn_pca = numpy.load(numpy_filepath_pca)
            return filepath, self.boundaries, matrix_sklearn_pca, len(sents)

        # save correct target-labels and additional info of current data
        targets_path = open(path + ".tbs", "wb")
        pickle.dump((labels_per_sent, labels_per_window, boundaries_backup,
                     len(sents), _WINDOW_SIZE, _STEP_SIZE), targets_path)

        # print(":time: Boundaries calculated, time elapsed {}").format(time.clock() - start)
        self.data = self.extract_features(sents, _WINDOW_SIZE, _STEP_SIZE,
                                          ind_features, dep_features)
        # self.data[year] = self.extract_features_para(paras, ind_features, dep_features)
        # print(":time: Features extracted, time elapsed {}").format(time.clock() - start)
        self.all_features = self.unified_features(self.data)
        # print(":time: Unified features, time elapsed {}").format(time.clock() - start)
        matrix_sklearn = self.feature_matrix_sklearn(
            self.generator_data(self.data))
        # print(":time: Matrix sklearn created, time elapsed {}").format(time.clock() - start)
        matrix_sklearn = util.normalize(matrix_sklearn)
        # print(":time: Matrix normalized, time elapsed {}").format(time.clock() - start)

        print "Exporting raw-data..."
        util.export_arff(matrix_sklearn,
                         self.dictVectorizer.get_feature_names(),
                         arff_filepath,
                         filename + "_RAW",
                         labels_per_window,
                         file_info=None)
        numpy.save(numpy_filepath, matrix_sklearn)

        # print "matrix dimensions before pca: {}".format(matrix_sklearn.shape)
        feature_names, feature_names_part = None, None
        if _DO_PCA:
            print "PCA calculation..."
            matrix_sklearn_pca, feature_names = util.pca(
                matrix_sklearn, self.dictVectorizer.get_feature_names())
            util.export_arff(matrix_sklearn_pca,
                             feature_names,
                             arff_filepath_pca,
                             filename + "_PCA95",
                             labels_per_window,
                             file_info=None)
            numpy.save(numpy_filepath_pca, matrix_sklearn_pca)

            del matrix_sklearn
        gc.collect()
        return filepath, boundaries_backup, matrix_sklearn_pca, len(sents)
Example #28
CCC = text_match(
    '''澎湃新闻(www.thepaper.cn)梳理发现,9月24日,青岛市在对青岛港大港公司进口冷链产品装卸工人进行定期例行检测时,发现2例新型冠状病毒肺炎无症状感染者。10月11日,青岛市又发现3例新冠肺炎无症状感染者
随后,青岛将上述3例无症状感染者的密切接触者和市胸科医院相关人员作为''', "青岛")

#CCC = text_match('''asdccc''',"ccc")
print("CCC text_match", CCC)
print("CCC text_match", CCC.start())
print("CCC text_match", CCC.group(0))  #青岛
print("CCC text_match", CCC.groups())
print("CCC text_match", CCC.group())  #青岛

print("-------------------------------------------------------------")
print("-------------------------------------------------------------")
#---------line split----------------------------------------------------------------------------------------------
lTokenizer = LineTokenizer()
# lTokenizer result:
AAA = lTokenizer.tokenize(
    '''澎湃新闻(www.thepaper.cn)梳理发现,9月24日,青岛市在对青岛港大港公司进口冷链产品装卸工人进行定期例行检测时,发现2例新型冠状病毒肺炎无症状感染者。10月11日,青岛市又发现3例新冠肺炎无症状感染者
随后,青岛将上述3例无症状感染者的密切接触者和市胸科医院相关人员作为高风险人群进行重点监测,共排查到密切接触者和市胸科医院所有在院患者及陪护人员377人,其中新增核酸检测结果阳性9人,包括8名市胸科医院在院患者及陪护人员、1名患者家属,经专家组判定,其中4例为确诊病例、5例为无症状感染者。
青岛市卫健委通报称,截至10月11日23时,青岛市共发现6例确诊病例,6例无症状感染者。到目前发现的所有确诊病例和无症状感染者均与市胸科医院高度关联。而市胸科医院部分独立区域承担着收治境外输入新冠病毒感染者的任务。
澎湃新闻(www.thepaper.cn)注意到,山东正调集全省流调和检测方面的机动力量,赴青岛提供支持。山东省委、省政府已经在青岛设立前方指挥部,青岛市正开展大规模核酸检测,全面彻底排查可能的感染者,以尽快实现城区人员检测全覆盖。
''')
print(AAA)

#---------get_close_matche----------------------------------------------------------------------------------------------
aaa = difflib.get_close_matches(
    '(www.thepaper.cn)注意到,山东正调集全省流调和检测方面的机动力量,赴青岛提供支持。山东省委、省政府已经在青岛设立前方指挥部,青岛市正开展大规模核酸检测,全面彻底排查可能的感染者,以尽快实现城区人员检测全覆盖。',
    AAA,
    1,
    cutoff=0.1)
Example #29
#python telugu_tokenizer.py "భారతదేశపు దక్షిణ \tసముద్ర \nప్రాంతంలో అది \n ‘పార్సన్స్ పిగ్మాలియన్’ ప్రదేశ"   ' 	'
tokens = []

if(dt == " "):
	tokens = SpaceTokenizer().tokenize(s)

#python telugu_tokenizer.py "భారతదేశపు దక్షిణ \tసముద్ర \nప్రాంతంలో అది \t ‘పార్సన్స్ పిగ్మాలియన్’ ప్రదేశ"   '\t'
from nltk.tokenize import TabTokenizer


if(dt == '\\t'):
	print "dt = "+dt
	s = s.replace(u'\\t','\t')
	tokens = TabTokenizer().tokenize(s)

#python telugu_tokenizer.py "భారతదేశపు దక్షిణ \tసముద్ర \nప్రాంతంలో అది \n ‘పార్సన్స్ పిగ్మాలియన్’ ప్రదేశ"   '\n'
from nltk.tokenize import LineTokenizer

if(dt == '\\n'):
	s = s.replace(u'\\n','\n')
	tokens = LineTokenizer(blanklines='discard').tokenize(s)


for token in tokens:
	print token.encode('utf-8','replace')

#print ', '.join(repr(x.encode('utf-8','replace')) for x in tokens)
#print type(tokens)(x.encode('utf-8') for x in tokens)

Example #30
def load_stop_words(lang):
    content = read_file(STOP_WORDS_PATH)
    lt = LineTokenizer()
    return lt.tokenize(content)
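The fragment above omits STOP_WORDS_PATH and ignores its lang argument; a self-contained variant, assuming a hypothetical layout of one stop-word file per language, could look like this:

import os
from nltk.tokenize import LineTokenizer

def load_stop_words(lang):
    # hypothetical layout: stopwords/<lang>.txt, one stop word per line
    path = os.path.join('stopwords', lang + '.txt')
    with open(path, encoding='utf-8') as f:
        return LineTokenizer(blanklines='discard').tokenize(f.read())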