Code Example #1
def missingCorpus(corpusdir):
    try:
        os.makedirs(corpusdir)
    except OSError:
        if not os.path.isdir(corpusdir):
            raise
    
    try:
        os.makedirs(corpusdir+'/ratings')
    except OSError:
        if not os.path.isdir(corpusdir+'/ratings'):
            raise
    hotel = json.load(open(data_path+file))
    stopset = hotelNameAddress(hotel)
    stopgroup = ""
    for e in stopset:
        stopgroup += e+" "
    stopgroup = stopgroup[0:-1]
    with open(corpusdir+'/stopset.txt', 'w') as fout:
        fout.write(stopgroup)
    revNum = 0
    for review in hotel.get('Reviews'):
        revNum += 1
        contentOut = ""
        overall = review.get('Ratings').get('Overall')
        content = pos_tag_sents([word_tokenize(sentence) for sentence in sent_tokenize(review.get('Content'))])
        with open (corpusdir+'/ratings/OverallRating'+str(revNum)+'.txt', 'w') as fout:
            fout.write(str(overall))  # the rating may be numeric in the JSON
        with codecs.open(corpusdir+'/Review'+str(revNum)+'.txt', 'w', encoding = "utf-8") as fout:
            for sentence in content:
                for word, pos in sentence:
                    contentOut += word+"/"+pos+" "
                contentOut += '\n'
            fout.write(contentOut)
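The snippet above relies on module-level data_path and file variables and a project-specific hotelNameAddress helper that it does not show. A minimal, hypothetical invocation sketch; every name below is an assumption for illustration, not part of the original project:

# Hypothetical invocation sketch; data_path, file and hotelNameAddress are not
# defined in the snippet above, so everything here is an illustrative assumption.
import os, json, codecs
from nltk import sent_tokenize, word_tokenize, pos_tag_sents

data_path = 'json/'          # assumed directory holding the hotel JSON files
file = '72572.json'          # assumed single hotel file name

def hotelNameAddress(hotel):
    # Stand-in for the project helper: return hotel name/address tokens as a stop set.
    info = hotel.get('HotelInfo', {}) or {}
    return set((str(info.get('Name', '')) + ' ' + str(info.get('Address', ''))).split())

missingCorpus('corpus_' + os.path.splitext(file)[0])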
Code Example #2
def twitter_token():
    from nltk.corpus import twitter_samples
    from nltk.tag import pos_tag_sents

    tweets = twitter_samples.strings('positive_tweets.json')
    tweets_tokens = twitter_samples.tokenized('positive_tweets.json')

    tweets_tagged = pos_tag_sents(tweets_tokens)
    """
    JJ:Adjective
    singular nouns (NN)
    plural nouns (NNS)
    
    """
    JJ_count = 0
    NN_count = 0

    for tweet in tweets_tagged:
        for word, tag in tweet:
            if tag == 'JJ':
                JJ_count += 1
            elif tag == 'NN':
                NN_count += 1

    print('Total number of adjectives = ', JJ_count)
    print('Total number of nouns = ', NN_count)
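To run the function above, the NLTK resources it depends on have to be installed first; the names below are the standard NLTK download IDs:

# One-time setup assumed by twitter_token(): the tweet corpus and the default
# POS tagger model must be downloaded before pos_tag_sents will work.
import nltk
nltk.download('twitter_samples')
nltk.download('averaged_perceptron_tagger')

twitter_token()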
Code Example #3
def nouns_verbs_to_POSTAG(sample):
    '''
    Same principle as to_POSTAG but only applies transformation to nouns and verbs
    :param sample: full text sample
    :type sample: str
    Output: str
    '''
    # Convert sample into input form needed by pos_tag_sents: list(list(str))
    # Needed: sentence-tokenize, then word-tokenize
    sents_list = [word_tokenize(sent) for sent in sent_tokenize(sample)]
    # pos-tagging
    POS = pos_tag_sents(sents_list)
    # extracting the postags and put into one str to replace original sample
    # list(list((word, tag),(word, tag),(word, tag)))
    POS_sents = []
    for sent_list in POS:
        word_pos_seq = []
        for word, tag in sent_list:
            if tag.startswith('NN') or tag.startswith('VB'):
                word_pos_seq.append(tag)
            else:
                word_pos_seq.append(word)

        word_pos_seq = ' '.join(
            word_pos_seq)  # This will be a string like 'NNP sees the NN .'
        POS_sents.append(word_pos_seq)
    POS_sample = ' '.join(POS_sents)

    return POS_sample
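A minimal usage sketch for the function above, assuming word_tokenize, sent_tokenize and pos_tag_sents are imported from nltk as in the other examples; the sample sentence and the exact tags shown are only illustrative:

from nltk import sent_tokenize, word_tokenize, pos_tag_sents

sample = "The cat sees the bird. It flew away quickly."
print(nouns_verbs_to_POSTAG(sample))
# Illustrative output shape: 'The NN VBZ the NN . It VBD away quickly .'
# (nouns and verbs replaced by their tags, everything else left as words)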
Code Example #4
def main():
    sentlist, dictlist = extractinfo()
    print('Info Extracted..')
    postaglist = pos_tag_sents([sent.split() for sent in sentlist],
                               tagset='universal')
    print('POS Tagging Done')
    data = makedata(postaglist, dictlist)
    printdata(data)

    vocabwords = makewordvocab()
Code Example #5
File: readers.py Project: CharleyPeng1/pke
 def __init__(self, path):
     self.sentences = []
     with codecs.open(path, 'r', 'utf-8') as f:
         sentences = [word_tokenize(s) for s in sent_tokenize(f.read())]
         tuples = pos_tag_sents(sentences)
         for sentence in tuples:
             self.sentences.append({
                 "words" : [u[0] for u in sentence],
                 "POS" : [u[1] for u in sentence]
             })
Code Example #6
 def __init__(self, path):
     self.sentences = []
     with codecs.open(path, 'r', 'utf-8') as f:
         sentences = [word_tokenize(s) for s in sent_tokenize(f.read())]
         tuples = pos_tag_sents(sentences)
         for sentence in tuples:
             self.sentences.append({
                 "words" : [u[0] for u in sentence],
                 "POS" : [u[1] for u in sentence]
             })
Code Example #7
File: readers.py Project: xiaoman0220/pke
 def __init__(self, path=None, input_text=None):
     self.sentences = []
     raw_text = input_text
     if path is not None:
         with codecs.open(path, 'r', 'utf-8') as f:
             raw_text = f.read()
     sentences = [word_tokenize(s) for s in sent_tokenize(raw_text)]
     tuples = pos_tag_sents(sentences)
     for sentence in tuples:
         self.sentences.append({
             "words": [u[0] for u in sentence],
             "POS": [u[1] for u in sentence]
         })
Code Example #8
def posTagging(orgFile,targetFile):
	count_org , count= 0, 0
	english_punctuations = ['(', '[', '<', ')', ']','>',',', '.', ':',
							';', '?', '!', '@', '#', '%', '$', '*',' ','','\n']
	try:	
		f=open(orgFile,'r')
		lines=f.readlines()
	# except Exception as e:
		# print(orgFile, "file decoding error\n", traceback.format_exc())
	except UnicodeDecodeError:
		print("Error:", orgFile, "file decoding error")
		return 2,1
	else:
		f.close()

	ff=open(targetFile,'w')
	for line in lines:
		# count the number of words in the source line
		line_list = line.split()
		words_list = [i for i in line_list if i not in english_punctuations]
		count_org += len(words_list) 
		for w in words_list:  # handle tokens like Prob(source=l)
			if '(' in w and w[0]!='(' and w[-1]!='(':
				# print(w)
				count_org += 1
			elif ')' in w and w[0]!=')' and w[-1]!=')' and w[-1] not in english_punctuations[6:12]: #(2016),
				# print(w)
				count_org += 1

		line_str, pre_word = '',''
		res = pos_tag_sents([word_tokenize(i) for i in sent_tokenize(line)])
		for sents in res:  # for each sentence
			for word_tuple in sents: # for each token in the sentence
				if word_tuple[0] in english_punctuations:
					word = ''.join([' ' for i in english_punctuations[0:3] if word_tuple[0]==i]) + word_tuple[0]
				else:
					if pre_word in [' '+ j for j in english_punctuations[0:3]]:
						word = "_".join(word_tuple)
					else:
						word = ' '+"_".join(word_tuple)
					count += 1
				pre_word = word
				line_str += word
		ff.write(line_str)
		ff.write('\n')
	ff.close()
	if count==0:
		print(orgFile, "total word count is 0")
		return 2,1
	print("Source text word count:", count_org, " word count after tagging:", count)
	return count_org,count
Code Example #9
    def pos_tag(self, tokens):
        return pos_tag_sents(tokens)


# data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."

# preprocessor = Preprocessor()
# print preprocessor.proccess_text(data)

# dataset = [
# 'Today was a bad day',
# 'I love running in the park',
# 'I used to have a cat when I was a kid'
# ]

# print preprocessor.process_dataset(dataset)
Code Example #10
def lemmatization(tokenized_sents):
    # lemmatization
    lem = WordNetLemmatizer()
    sents_pos_tag = tag.pos_tag_sents(tokenized_sents, lang="eng")
    wn_sents_pos_tag = []
    for sent in sents_pos_tag:
        wn_sents_pos_tag.append(
            map(lambda tup: (tup[0], convert_nltk2wn_pos(tup[1])), sent))
    lem_sents = []
    for sent in wn_sents_pos_tag:
        s = []
        for word, pos in sent:
            if pos is None:
                s.append(word)
            else:
                s.append(lem.lemmatize(word, pos))
        lem_sents.append(s)
    return lem_sents
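The helper convert_nltk2wn_pos is not shown in the snippet; a minimal sketch of what such a Penn-Treebank-to-WordNet mapping typically looks like (an assumption, not the project's actual implementation):

from nltk.corpus import wordnet

def convert_nltk2wn_pos(penn_tag):
    # Map a Penn Treebank tag prefix to the WordNet POS constants that
    # WordNetLemmatizer.lemmatize() accepts; return None when there is no mapping.
    if penn_tag.startswith('J'):
        return wordnet.ADJ
    if penn_tag.startswith('V'):
        return wordnet.VERB
    if penn_tag.startswith('N'):
        return wordnet.NOUN
    if penn_tag.startswith('R'):
        return wordnet.ADV
    return None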
Code Example #11
    def transform1(self, X):
        # word
        X_tokenized1 = [self.feature_fxn(t) for t in X]

        # POS label
        tweets_tagged = pos_tag_sents(X_tokenized1)
        X_tokenized = []
        for i in range(len(tweets_tagged)):
            X_tokenized.append([])
            # i is the index of the tweet/document
            for j in range(len(tweets_tagged[i])):
                X_tokenized[i].append(tweets_tagged[i][j][1])
        # print(X_tokenized[2])
        # print("------------------------")
        X = []
        for tweet in X_tokenized:
            X.append(
                [self.feature_map[w] for w in tweet if w in self.feature_map])
        X = sequence.pad_sequences(X, maxlen=self.max_len, padding='post')
        return X
Code Example #12
def noun_freq_abstraction(sample, model=None):
    '''
    Combination of above noun abstraction methods and get_word_freq
    '''
    # if using default tagger (i.e. nltk tagger)
    if model == None:
        sents_list = [word_tokenize(sent) for sent in sent_tokenize(sample)]
        # pos-tagging
        POS = pos_tag_sents(sents_list)
        # extracting the postags and put into one str to replace original sample
        # list(list((word, tag),(word, tag),(word, tag)))
        POS_sents = []
        for sent_list in POS:
            word_pos_seq = []
            for word, tag in sent_list:
                if tag.startswith('NN'):
                    freq = str(get_word_freq(word))
                    info = tag + '_' + freq
                    word_pos_seq.append(info)
                else:
                    word_pos_seq.append(word)
            word_pos_seq = ' '.join(
                word_pos_seq
            )  # This will be a string like 'NNP_5.3 sees the NN_4.8 .'
            POS_sents.append(word_pos_seq)
        POS_sample = ' '.join(POS_sents)
    # if using, e.g. spacy
    else:
        doc = model(sample)
        tokens = []
        for token in doc:
            if token.tag_.startswith('N'):
                freq = str(get_word_freq(token.text))
                info = token.tag_ + '_' + freq
                tokens.append(info)
            else:
                tokens.append(token.text)
        POS_sample = ' '.join(tokens)

    return POS_sample
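get_word_freq is not defined in the snippet; one plausible stand-in uses the wordfreq package's Zipf-scale frequency (an assumption, not the project's own helper):

# Stand-in for the undefined get_word_freq helper; wordfreq is a third-party
# package (pip install wordfreq) and its use here is only an assumption.
from wordfreq import zipf_frequency

def get_word_freq(word, lang='en'):
    # Zipf frequency: roughly log10 of occurrences per billion words (about 7 for 'the').
    return round(zipf_frequency(word.lower(), lang), 1)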
Code Example #13
def to_POSTAG(sample):
    '''
    Turns each sample into a long string of postags, abstracting away from the actual words
    :param sample: full text sample
    :type sample: str
    Output: str
    '''
    # Convert sample into input form needed by pos_tag_sents: list(list(str))
    # Needed: sentence-tokenize, then word-tokenize
    sents_list = [word_tokenize(sent) for sent in sent_tokenize(sample)]
    # pos-tagging
    POS = pos_tag_sents(sents_list)
    # extracting the postags and put into one str to replace original sample
    POS_sents = []
    for sent_list in POS:
        pos_seq = [tag for word, tag in sent_list]
        pos_seq = ' '.join(
            pos_seq)  # This will be a string like 'NNP VB DET NNP .'
        POS_sents.append(pos_seq)
    POS_sample = ' '.join(POS_sents)

    return POS_sample
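A short usage sketch, analogous to the one for nouns_verbs_to_POSTAG above; the exact tags shown are illustrative:

sample = "The cat sees the bird."
print(to_POSTAG(sample))
# Illustrative output: 'DT NN VBZ DT NN .'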
Code Example #14
def get_koss(fn_k, my_env, sysdic, stopdic):  # in the Japanese-English case this side is English
    #s_mecab_ko = get_s_mecab(fn_k, 'ko', my_env)
    #koss = get_mrphs(s_mecab_ko, 'ko')

    # for English, do not run get_mrphs first
    infile = open(fn_k, 'r', encoding='UTF-8')
    koss = []
    lines = infile.readlines()

    sents = []
    for l in lines:
        t = word_tokenize(l.strip())
        #koss.append(t)
        sents.append(t)

    lemmatizer = WordNetLemmatizer()
    tags = pos_tag_sents(sents)

    for sent in tags:
        tmp = []
        for w_pair in sent:
            word = w_pair[0].lower()  # lowercase every word first
            #if w_pair[1] in ["NN","NNS"]:
            #    word = word.capitalize()
            postag = penn_to_wn(w_pair[1])
            if (word in stopdic):
                continue
            if postag is None or postag == "v":
                #tmp.append(word)
                continue
            else:
                tmp.append(lemmatizer.lemmatize(word, postag))

        koss.append(tmp)

    infile.close()
    return koss
Code Example #15
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag_sents

# Load tokenized data of tweets
tokens = twitter_samples.tokenized('positive_tweets.json')

# tag tokens
tagged_tokens = pos_tag_sents(tokens)

# print output of a couple of tagged tokens

## count nouns
noun_count = 0
adj_count = 0

for tweet in tagged_tokens:
    for pair in tweet:
        tag = pair[1]
        if tag == 'NN':
            noun_count += 1

print('Number of nouns: ', noun_count)
Code Example #16
File: sent2vec10.py Project: trenslow/LightRel
        if 'train' in record_file:
            all_M_vals = [record[6] for record in records]
            M = max(all_M_vals)
            avg_M = sum(all_M_vals) // len(records)
            if use_avg_M_plus_mode:
                M_counts = Counter(all_M_vals)
                avg_M += max(M_counts.items(), key=operator.itemgetter(1))[0]
            positions = create_pos_index(sentences_and_indexes, M, avg_M)
            num_positions = len(positions) if fire_positions else 0
        norm_sents = [
            normalize(M, sentence, avg_M) for sentence in sentences_and_indexes
        ]
        if fire_tagger:
            tagged_sents = pos_tag_sents(
                [[w if w is not None else 'none' for w in sent]
                 for sent in norm_sents])
        else:
            tagged_sents = None
        if max_suffix_size == 0:
            word_lengths = [len(w) for w in words]
            suffix_length = sum(word_lengths) // num_words
        else:
            suffix_length = max_suffix_size
        num_char_embeddings = suffix_length * char_emb_dims
        num_cats = cat_dims * suffix_length
        len_token_vec = num_words + num_positions + num_clusters + num_suffixes + num_shapes + num_tags + num_embeddings + num_char_embeddings + num_cats
        feat_val = ':1.0'

        with open(out_file, 'w+') as lib_out:
            for i, sentence in enumerate(sentences_and_indexes):
Code Example #17
def tag_POS(process_pos):
    return pos_tag_sents(process_pos)
Code Example #18
 try:
     os.makedirs(corpusdir)
 except OSError:
     if not os.path.isdir(corpusdir):
         raise
 
 try:
     os.makedirs(corpusdir+'/ratings')
 except OSError:
     if not os.path.isdir(corpusdir+'/ratings'):
         raise
 hotel = json.load(open(path+file))
 stopset = hotelNameAddress(hotel)
 stopgroup = ""
 for e in stopset:
     stopgroup += e+" "
 stopgroup = stopgroup[0:-1]
 with open(corpusdir+'/stopset.txt', 'w') as fout:
     fout.write(stopgroup)
 revNum = 0
 for review in hotel.get('Reviews'):
     revNum += 1
     contentOut = ""
     overall = review.get('Ratings').get('Overall')
     content = pos_tag_sents([word_tokenize(sentence) for sentence in sent_tokenize(review.get('Content'))])
     with open (corpusdir+'/ratings/OverallRating'+str(revNum)+'.txt', 'w') as fout:
         fout.write(str(overall))  # the rating may be numeric in the JSON
     with codecs.open(corpusdir+'/Review'+str(revNum)+'.txt', 'w', encoding = "utf-8") as fout:
         for sentence in content:
             for word, pos in sentence:
                 contentOut += word+"/"+pos+" "
             contentOut += '\n'
         fout.write(contentOut)
 
Code Example #19
from nltk.corpus import gutenberg
from gensim.models import Word2Vec
#for POS (parts of speech)
from nltk.tag import pos_tag_sents

# Both Word2Vec and pos_tag_sents expect tokenized sentences (a list of token
# lists), not the raw corpus string or the trained model itself.
g_sents = gutenberg.sents()
g = Word2Vec(g_sents)

g_tagged = pos_tag_sents(g_sents)
g_len = len(g_tagged)
print(g_len)

#JJ_count = 0
#NN_count = 0

#for tweet in tweets_tagged:
#	for pair in tweet:
#		tag = pair[1]
#		if tag == 'JJ':
#			JJ_count += 1
#		elif tag == 'NN':
#			NN_count += 1

#print ('Total number of adjectives = ', JJ_count)
#print ('Total number of nouns = ', NN_count)

Code Example #20
from nltk.corpus import gutenberg
#for POS (parts of speech)
from nltk.tag import pos_tag_sents
from nltk import word_tokenize, sent_tokenize

# Tokenize the text of the corpus file, not the file name string;
# pos_tag_sents expects a list of token lists (one list per sentence).
emma_raw = gutenberg.raw('austen-emma.txt')
emma_tokens = [word_tokenize(sent) for sent in sent_tokenize(emma_raw)]

emma_tagged = pos_tag_sents(emma_tokens)

JJ_count = 0
NN_count = 0

for words in emma_tagged:
    for pair in words:
        tag = pair[1]
        if tag == 'JJ':
            JJ_count += 1
        elif tag == 'NN':
            NN_count += 1

print('Total number of adjectives = ', JJ_count)
print('Total number of nouns = ', NN_count)
Code Example #21
# https://www.digitalocean.com/community/tutorials/how-to-work-with-language-data-in-python-3-using-the-natural-language-toolkit-nltk
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag_sents

tweets = twitter_samples.strings('positive_tweets.json')
tweets_tokens = twitter_samples.tokenized('positive_tweets.json')
tweets_tagged = pos_tag_sents(tweets_tokens)
print(tweets_tagged)

JJ_count = 0
NN_count = 0

for tweet in tweets_tagged:
    # print("===",tweet)

    for pair in tweet:
        tag = pair[1]
        if tag == 'JJ':
            JJ_count += 1
        elif tag == 'NN':
            NN_count += 1

print('NN count==', NN_count)
print('JJ_count==', JJ_count)
Code Example #22
ff=open('demo1.txt','w',encoding='UTF-8')
for line in lines:
	# count the number of words in the source line
	line_list = line.split()
	words_list = [i for i in line_list if i not in english_punctuations]
	count_org += len(words_list) 
	for w in words_list:  # handle tokens like Prob(source=l)
		if '(' in w and w[0]!='(' and w[-1]!='(':
			print(w)
			count_org += 1
		elif ')' in w and w[0]!=')' and w[-1]!=')' and w[-1] not in english_punctuations[6:12]: #(2016),
			print(w)
			count_org += 1

	line_str, pre_word = '',''
	res = pos_tag_sents([word_tokenize(i) for i in sent_tokenize(line)])
	for sents in res:  # for each sentence
		for word_tuple in sents: # for each token in the sentence
			if word_tuple[0] in english_punctuations:
				word = ''.join([' ' for i in english_punctuations[0:3] if word_tuple[0]==i]) + word_tuple[0]
			else:
				if pre_word in [' '+ j for j in english_punctuations[0:3]]:
					word = "_".join(word_tuple)
				else:
					word = ' '+"_".join(word_tuple)
				count += 1
			# print("_".join(word_tuple))
			pre_word = word
			line_str += word
	ff.write(line_str)
	ff.write('\n')
Code Example #23
    def extract_entity_patterns(self, chunk={}):
        color_mapping = {
            'magenta': ['NN', 'NNS'],
            'green': ['NNP', 'NNPS'],
            'cyan': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
            'yellow': ['JJ', 'JJR', 'JJS']
        }
        # reverse color mapping
        color_mapping = {
            v: k
            for k, values in color_mapping.iteritems() for v in values
        }
        for entity, relations in chunk.iteritems():
            cleaned_subject_entity_name = uri_rewriting.strip_cleaned_name(
                entity)
            subject_entity = uri_rewriting.strip_name(entity)
            for rel_ontology, values in relations.iteritems():
                target_resources = values['resources']
                sentences = values['sentences']
                rel_ontology = rel_ontology.split('/')[-1]
                data = [{
                    'entity': cleaned_subject_entity_name,
                    'relation': rel_ontology,
                    'resource': res,
                    'sentence': sent
                } for res in target_resources for sent in sentences
                        if sent.contains_any_link([res]) and res != entity]

                # remove needless sentence information based on relation facts
                # data = map(self.shorten_sentence, data)
                # POS tag sentences
                for entry in data:
                    sentence = entry['sentence']
                    if sentence.number_of_tokens() > 50:
                        continue  # probably too long for stanford tokenizer
                    resource = entry['resource']
                    nl_sentence = sentence.as_string()
                    relative_position = sentence.relative_pos
                    entry['nl sentence'] = nl_sentence
                    tokenized_sentences = map(word_tokenize, [nl_sentence])
                    pos_tagged_sentences = pos_tag_sents(
                        tokenized_sentences).pop()

                    object_addresses = sentence.addresses_of_link(resource)
                    object_entity = uri_rewriting.strip_name(resource)
                    pattern = self.pattern_extractor.extract_pattern(
                        nl_sentence, object_addresses, relative_position,
                        self.type_learning, subject_entity, object_entity)
                    if pattern is not None:
                        values['patterns'].append(pattern)
                        entry['pattern'] = pattern

                    # color sentence parts according to POS tag
                    colored_sentence = [
                        colored(word, color_mapping.setdefault(pos, 'white'))
                        for word, pos in pos_tagged_sentences
                    ]
                    colored_sentence = ' '.join(colored_sentence)
                    colored_sentence = re.sub(
                        r' (.\[\d+m),', ',',
                        colored_sentence)  # remove space before commas
                    entry['colored_sentence'] = colored_sentence

                self.matches.extend(data)
Code Example #24
from typing import List, Dict # type hinting is important
from nltk.corpus import twitter_samples # JSON files of sample tweets
from nltk.tag import pos_tag_sents # part of speech tagger

# Each tweet is an item
pos_tweets: list = twitter_samples.strings('positive_tweets.json')

# Goal: count adjectives (descriptors) and nouns (things)
'''
1. Tokenization: breaking a sequence of strings up into words/phrases.
Each resulting piece is a token.
* In this case the tweets are split into tokens, roughly at each space
'''
pos_tweets_tokens: list = twitter_samples.tokenized('positive_tweets.json')

# tuples with token and tag
pos_tweets_tagged: list = pos_tag_sents(pos_tweets_tokens)

'''
Tags:
JJ: adjective
NN: singular noun
NNS: plural noun
'''

# Let us count how many times these appear!
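The snippet ends before the counting loop it announces; a minimal completion sketch following the tag legend above (the variable names are assumptions):

# Hedged completion: count adjectives (JJ) and nouns (NN/NNS) in the tagged tweets.
jj_count: int = 0
nn_count: int = 0
for tweet in pos_tweets_tagged:
    for _token, tag in tweet:
        if tag == 'JJ':
            jj_count += 1
        elif tag in ('NN', 'NNS'):
            nn_count += 1
print('adjectives:', jj_count, 'nouns:', nn_count)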
Code Example #25
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag_sents

tweets = twitter_samples.strings('positive_tweets.json')
tweets_tokens = twitter_samples.tokenized('positive_tweets.json')

JJ_count = 0
NN_count = 0

tweets_tagged = pos_tag_sents(tweets_tokens)

for tweet in tweets_tagged:
    for pair in tweet:
        tag = pair[1]
        if tag == 'JJ':
            JJ_count += 1
        elif tag == 'NN':
            NN_count += 1

print('Total number of adjectives = ', JJ_count)
print('Total number of nouns = ', NN_count)
Code Example #26
File: lsverbs.py Project: av-dx/cl1-project
    "data/FMFS_Module_8_Verified_Post_Verbatim_TScript.txt",
}
text = ""
for fname in fnames2:
    inFile = open(fname, 'r')
    text += inFile.read()

text = text.replace('’', "'")

verbs = {}
sents = sent_tokenize(text)
# print(sents)
for i, s in enumerate(sents):
    sents[i] = word_tokenize(s)

taggedsents = pos_tag_sents(sents)
# print(taggedsents)
for s in taggedsents:
    # print(s)
    for w, t in s:
        if t in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
            # print(w)
            if (verbs.get(w.lower()) is not None):
                verbs[w.lower()] += 1
            else:
                verbs[w.lower()] = 1

sorted_verbs = sorted(verbs.items(), key=operator.itemgetter(1))
sorted_verbs.reverse()

i = 0
Code Example #27
argparser.add_argument('file', help="text document")
args = argparser.parse_args()

stopwords = stopwords.words('english')

doc_path = os.path.splitext(args.file)[0]

tt = TextTilingTokenizer()
text = codecs.open(doc_path + '.txt', 'r', "utf-8").read()
parags = tt.tokenize(text)

buffer_tiled = ''
buffer_tiled_tagged = ''
buffer_tiled_tagged_clean = ''

tagged_parags = pos_tag_sents([word_tokenize(p) for p in parags])
clean_parags = [
    filter(lambda taggedword: taggedword[0] not in stopwords, p)
    for p in tagged_parags
]

for i, p in enumerate(parags):
    buffer_tiled += p

    for word, tag in tagged_parags[i]:
        buffer_tiled_tagged += word + "/" + tag + ' '
        if word not in stopwords:
            if tag[0] == 'V': tag_abstract = 'verb'
            elif tag[0] == 'N': tag_abstract = 'noun'
            else: continue
            buffer_tiled_tagged_clean += word + ' ' + tag_abstract + '\n'
Code Example #28
def tagText(text):
    sents = [word_tokenize(s) for s in sent_tokenize(text)]

    taggedSents = pos_tag_sents(sents)

    return taggedSents
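A quick check of the helper above; the expected result is a list of sentences, each a list of (word, tag) tuples:

tagged = tagText("NLTK tags text. It works per sentence.")
for sent in tagged:
    print(sent)
# Each printed item is a list of (word, tag) tuples for one sentence.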
Code Example #29
from nltk.tag import pos_tag_sents
import nltk
import os

file = open('sam2.txt', 'r')
texts = file.readlines()
text_tokenized = []
for txt in texts:
    [date, text] = txt.strip().split('@')
    text = text.lower()
    text_tokenized.append(nltk.word_tokenize(text))
pos = pos_tag_sents(text_tokenized)

journal = open('journal.txt', 'w')

journal.write(
    '------------------------------------------------------------------------------------------\n'
)
journal.write('Date \t\t\t\t\tParticulars\t\t\t\t\t\t\t\t\tAmount(Rs.)\n')
journal.write('%-10s 	%-20s%20s' % ('', '  Dr.', 'Cr.\n'))
journal.write(
    '==========================================================================================\n\n'
)

cash = open(os.path.join('ledgers', 'cash'), 'w')
cash.write(
    '------------------------------------------------------------------------------------------\n'
)
cash.write('Date \t\t\t\t\tParticulars\t\t\t\t\t\t\t\t\tAmount(Rs.)\n')
cash.write('%-10s 	%-20s%20s' % ('', '  Dr.', 'Cr.\n'))
cash.write(
Code Example #30
# -*- coding: utf-8 -*-
"""
Created on Mon Oct  2 14:59:21 2017

@author: hp
"""

import nltk
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag_sents
JJCount = 0
NNCount = 0
tweets = twitter_samples.strings('positive_tweets.json')
"""Each tweet is stored as a string in a list varibale 'Tweets' """
tweets_token = twitter_samples.tokenized('positive_tweets.json')
"""Each string tweet needs to be broken down to keywords, phrases,symbols etc.
 these are called Tokens """
tweets_tagged = pos_tag_sents(tweets_token)
for tweet in tweets_tagged:
    for pair in tweet:
        if pair[1] == 'NN':
            NNCount += 1
        elif pair[1] == 'JJ':
            JJCount += 1
print('Total Adjectives= ', JJCount)
print('Total Nouns= ', NNCount)
Code Example #31
File: process.py Project: vialab/docuburst-desktop

stopwords = stopwords.words("english")

doc_path = os.path.splitext(args.file)[0]

tt = TextTilingTokenizer()
text = codecs.open(doc_path + ".txt", "r", "utf-8").read()
parags = tt.tokenize(text)


buffer_tiled = ""
buffer_tiled_tagged = ""
buffer_tiled_tagged_clean = ""

tagged_parags = pos_tag_sents([word_tokenize(p) for p in parags])
clean_parags = [filter(lambda taggedword: taggedword[0] not in stopwords, p) for p in tagged_parags]

for i, p in enumerate(parags):
    buffer_tiled += p

    for word, tag in tagged_parags[i]:
        buffer_tiled_tagged += word + "/" + tag + " "
        if word not in stopwords:
            if tag[0] == "V":
                tag_abstract = "verb"
            elif tag[0] == "N":
                tag_abstract = "noun"
            else:
                continue
            buffer_tiled_tagged_clean += word + " " + tag_abstract + "\n"