Code Example #1
File: nltk_helper.py  Project: ercgn/11411-proj
def parseTextToSentences(text):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'ms', 'mrs', 'prof', 'inc', 'no', 'e.g', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    data = text
    data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')

    sentences = []
    for para in data.split('\n'):
        if para:
            sentences.extend(sentence_splitter.tokenize(para))
    return sentences
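The snippet above omits its imports; PunktParameters and PunktSentenceTokenizer live in nltk.tokenize.punkt. A minimal usage sketch with made-up input text:

from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

text = 'Dr. Smith met Mr. Jones. "Is it raining?" he asked.\nA new paragraph starts here.'
for sentence in parseTextToSentences(text):
    print(sentence)
# The custom abbreviation list keeps "Dr." and "Mr." from being treated as
# sentence boundaries; paragraphs are split on newlines before tokenizing.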
Code Example #2
def removeTag(data):
    cleanData = ''
    for word in data.split():
        if '<' in word and '>' in word:  # token looks like a markup tag
            word = '. '
        cleanData = cleanData + word + ' '
    return cleanData
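With the tag check corrected as above, a quick sanity check on a hypothetical input:

print(removeTag("Hello <b>world</b> again"))
# Any whitespace-separated token containing both '<' and '>' is replaced by
# '. ', so this prints roughly: Hello .  again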
Code Example #3
def process_tokenized(data):
    """Process tokenized data and remove the newlines."""
    newdata = ""
    data = data.split("\n")
    for line in data:
        if LINE_SEPARATOR not in line:
            newdata = "{}{}".format(newdata, line.replace("\n", ""))
        else:
            newdata = "\n\n{}\n{}\n".format(newdata, line)
    return newdata
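LINE_SEPARATOR is a module-level constant that the excerpt does not show; the sketch below only makes that dependency explicit, using a made-up value so the function can be exercised:

LINE_SEPARATOR = "<<<DOC>>>"  # hypothetical; the real constant is defined elsewhere in the module

sample = "the quick brown\nfox jumps\n<<<DOC>>>\nover the lazy dog"
print(process_tokenized(sample))
# Non-separator lines are concatenated with their newlines stripped; when a
# separator line is reached, everything gathered so far is wrapped in blank
# lines and the separator is kept on its own line.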
Code Example #4
def gloss_count_ratio(data, get_ratio=0):
    term_count = 0
    word_count = 0
    for key in allkeys:  # allkeys is a module-level glossary dict defined elsewhere
        if key in data:
            term_count += 1
    if (get_ratio):
        word_count += len(data.split())
        return term_count, float(term_count) / word_count
    else:
        return term_count
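gloss_count_ratio reads a module-level allkeys glossary that the excerpt omits; a minimal sketch with a made-up glossary, just to show the call:

allkeys = {"neural network": 1, "gradient": 1}  # hypothetical glossary; the real dict is built elsewhere

text = "A neural network is trained by following the gradient."
count, ratio = gloss_count_ratio(text, get_ratio=1)
print(count, ratio)  # 2 glossary terms; ratio = 2 divided by the whitespace word count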
Code Example #5
def parse_text_to_sentences(text):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'ms', 'mrs', 'prof', 'inc', 'no', 'e.g', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    data = text
    data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')

    sentences = []
    for para in data.split('\n'):
        if para:
            sentences.extend(sentence_splitter.tokenize(para))
    return sentences
Code Example #6
 def extract_labels(self, binary=True):
     """
     Loads labeled dataframe and extract list of labeled subtitle ids.
     """
     data = self.key_to_labels.get_contents_as_string()
     valid_ratings = self.context.broadcast([u'R', u'PG-13', u'PG', u'G', u'NC-17'])
     labels_rdd = self.context.parallelize(data.split('\n')) \
                     .filter(lambda line: line != '' and not 'IDSubtitle' in line) \
                     .map(lambda line: (line.split(',')[0], line.split(',')[1])) \
                     .filter(lambda (file_id, rating): rating in valid_ratings.value)
     if binary:
         labels_rdd = labels_rdd.map(lambda (file_id, rating): (file_id, (rating != 'R')*'NOT_' + 'R'))
     return labels_rdd.sortByKey().cache() # for lookups
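A side note: the last two lambdas above rely on Python 2 tuple parameter unpacking, which PEP 3113 removed from the language. Under Python 3 the same steps would have to index the tuple instead, roughly:

# Python 3 rewrite of the two tuple-unpacking lambdas above
labels_rdd = labels_rdd.filter(lambda pair: pair[1] in valid_ratings.value)
if binary:
    labels_rdd = labels_rdd.map(
        lambda pair: (pair[0], ('' if pair[1] == 'R' else 'NOT_') + 'R'))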
Code Example #7
def gloss_count_ratio(data, get_ratio=0):
    term_count = 0
    word_count = 0
    for key in allkeys:  # allkeys and recalldict are module-level dicts defined elsewhere
        if key in data:
            #Found term. So increase its count
            recalldict[key] += 1
            term_count += 1
    if (get_ratio):
        word_count += len(data.split())
        #print "{},{},{}".format(float(term_count)/word_count,term_count,word_count)
        return float(term_count) / word_count
    else:
        return term_count
Code Example #8
def prepare_text(author=None, data=None):
    # text_column = "text"
    # author_column = "author"
    # train_file = os.path.join(os.getcwd(), "my_train.csv")
    # test_file = os.path.join(os.getcwd(), "my_test.csv")

    # sentences = tokenize_text(data)
    sentences = data.split(" ")
    train_sentences = " ".join(sentences[:int(len(sentences) * 0.50)])
    test_sentences = " ".join(sentences[int(len(sentences) * 0.50):])
    train_res = [author] * 2
    test_res = [author] * 2

    return train_sentences, test_sentences, train_res, test_res
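A small illustration of the 50/50 whitespace split performed by prepare_text (author name and text are made up):

train, test, train_res, test_res = prepare_text(author="austen",
                                                data="one two three four five six")
print(train)      # 'one two three'
print(test)       # 'four five six'
print(train_res)  # ['austen', 'austen']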
Code Example #9
File: Sentence_Split.py  Project: j-macdonald/cs53-1
def handleonetxtfile(inpname):
    basename = os.path.basename(inpname).split('.')[0]
    basename_txt = basename + ".txt"
    outname = basename + "_sentence_converted.txt"
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    fp = open(basename_txt)
    data = fp.read()
    paragraphs = [p for p in data.split('\n') if p]
    with open(outname, "a") as out:
        for paragraph in paragraphs:
            out.write('\n')
            out.write('\n'.join(tokenizer.tokenize(paragraph)))
            out.write('\n')
    fp.close()  # close the input file; the with-statement already handles "out"
Code Example #10
File: nltk_helper.py  Project: isaaclimdc/11411-QA
def splitIntoSentences2(file_name):
  punkt_param = PunktParameters()
  punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
  sentence_splitter = PunktSentenceTokenizer(punkt_param)
  fp = open(file_name)
  data = fp.read()
  data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')

  sentences = []
  for para in data.split('\n'):
    if para:
      sentences.extend(sentence_splitter.tokenize(para))
  # print '\n-----\n'.join(sentences)
  return sentences
Code Example #11
File: W2v.py  Project: saridsa1/cdc
def Tokenization(data, concept, stem, removeStopwords):
    if concept == False:
        data = BeautifulSoup(data).get_text()
        data = re.sub("\r\n", " ", data)
        data = re.sub("[^a-zA-Z0-9_]", " ", data)
        data = data.lower()
    if stem == True:
        stemmer = PorterStemmer()
        data = stemmer.stem(data)
    words = data.split()
    if removeStopwords == True:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return words
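The excerpt leaves out its imports; below is a sketch of the likely header plus a sample call. The import paths are assumptions (the original project may use the older BeautifulSoup package), and the NLTK stopwords corpus must already be downloaded:

import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

words = Tokenization("<p>The cats were running!</p>",
                     concept=False, stem=False, removeStopwords=True)
print(words)  # markup, punctuation and stopwords removed, e.g. ['cats', 'running']

One quirk worth noting: when stem is True the whole string is passed to PorterStemmer in a single call, as if it were one word; stemming each token after the split would be the more usual approach.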
Code Example #12
def split_sentences_nltk(file):
    # returns the topics separated into sentences, plus the topic titles
    try:
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    except LookupError:
        nltk.download('punkt')  # needed only for the first time
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    data = file.read()
    data_1 = data.split("changeTopicHere.")
    topics, data_1 = find_topic(data_1)  # find_topic is defined elsewhere in the module
    topics_data = []
    for x in range(len(data_1)):
        topics_data.append("\n".join(tokenizer.tokenize(data_1[x])))
    return topics, topics_data
Code Example #13
File: parse.py  Project: sfu-natlang/xtag-nltk
 def openparsefile(self):
     self.clear()
     self.filename = tkFileDialog.askopenfilename(**self.file_opt)
     if not self.filename:
         return
     data = open(self.filename, 'r').read()
     self.initree = {}
     filtdata = ""
     self.ininame = {}
     for entry in data.split('\n\n'):
         if not entry:
             continue
         for line in entry.split("\n"):
             elements = line.split()
             if len(elements) < 4:
                 continue
             if len(elements) == 5:
                 lex_list = word_to_features(elements[0])
                 fset = lex_search(lex_list, {}, self.alltrees)
                 for cata in fset:
                     if elements[4] in fset[cata]:
                         tree = fset[cata][elements[4]]
                         tree.lexicalize()
                         tree._lex = False
                         self.initree[elements[0]] = tree
                         self.ininame[elements[0]] = elements[4]
             self.relmap[elements[0]] = elements[1]
             self.tagmap[elements[0]] = elements[3]
             filtdata += "\t".join(elements[:4])
             filtdata += '\n'
         filtdata += "\n\n"
     self.graphs = [
         DependencyGraph(entry) for entry in filtdata.split('\n\n')[:-1]
         if entry
     ]
     trees = [graph.tree() for graph in self.graphs]
     count = 0
     for t in trees:
         key = 'tree ' + str(count)
         self._import_trees[key] = t
         count += 1
     self.populate_tree('', self._import_trees)
     self._import_trees_selected = True
     self.init_selector(trees[0])
Code Example #14
date = sys.argv[2]
article = 1
total_avg_polarity = 0

while (article <= 5):

    file = stock + '/' + date + '_2016/' + str(article) + '.txt'

    try:
        fp = open(file)
        article += 1
        data = fp.read()

        # print sentences

        title = data.split('\n', 1)[0]

        content = data

        # Create a SummaryTool object
        st = SummaryTool()

        # Build the sentences dictionary
        sentences_dic = st.get_sentences_ranks(content)

        # Build the summary with the sentences dictionary
        sentences = st.get_summary(title, content, sentences_dic)

        # print sentences

        ## instantiate senticnet
Code Example #15
File: BEscores.py  Project: shraddhansahula/complexQA
root = inp.getroot()

candidates = []
for child in root.iter():
	if child.tag == "P":
		
		child = tokenizer.tokenize(child.text)
		for i in child:
			candidates.append(i)


data = "Discuss conditions on American Indian reservations or among Native American communities. Include the benefits and drawbacks of the reservation system. Include legal privileges and problems."
print "Query Sentence"
print data
print "\n"
data = data.split(" ")
queryRel = [] 
for word in data: 
	for i,j in enumerate(wn.synsets(word)):
		for l in j.lemmas():
			queryRel.append(l.name())
		#queryRel.append(l.lemma_names() for l in j.hypernyms())
		for l in j.hypernyms():
			for k in l.lemma_names():
				queryRel.append(k)
		for l in j.hyponyms():
			for k in l.lemma_names():
				queryRel.append(k)


def LLR(e):
Code Example #16
File: markov.py  Project: rotated8/idle-chains
 def file_to_words(self):
     data = None
     with open(self.corpus_file) as corpus:
         data = corpus.read()
     words = data.split()
     return words
Code Example #17
def get_raw_paragraph(fileid):
    # TODO: test if this works with the yahoo! corpus as well (encoding might differ)
    data = corpus.raw(fileid)
    return data.split(u"\r\n \r\n")
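The corpus object is created elsewhere (the TODO suggests an NLTK corpus reader); a rough setup sketch with a hypothetical path, only to show how the function would be called:

from nltk.corpus import PlaintextCorpusReader

corpus = PlaintextCorpusReader("/path/to/corpus", r".*\.txt")  # hypothetical location and pattern
paragraphs = get_raw_paragraph(corpus.fileids()[0])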
Code Example #18
#     print(x)

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
data = f.read()
result = ('\n------\n'.join(tokenizer.tokenize(data)))
# print(result)
# data = f.read()

# for e in firstprocess:
#     # print(e)
#     # print(e.index('\n'))
#     s = e.split()
#     print(s)
#     break
allSentences = []
data2 = data.split()
newWord = False
upperCaseWord = False
lowerCaseWord = False
currentSentence = ""
for e in data2:
    if e.isupper():
        upperCaseWord = True
        if lowerCaseWord is True:
            newWord = True
            allSentences.append(currentSentence)

        currentSentence = currentSentence + ' ' + e
    if e.islower():
        if upperCaseWord is True:
            newWord = True
Code Example #19
File: extract.py  Project: dbtmindset/twitter
import nltk.data
import nltk
import language_check
import string
import re

tool = language_check.LanguageTool('en-US')
nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

lines = []
fp = open("./data/dbt_handouts_pass1.txt")
data = fp.read()
data = data.replace("\n", ' ').replace('•• ',
                                       '').replace('*',
                                                   '').replace('Other:', '')
data = re.sub(r'\d+', ' ', data)
data = re.sub(r'[^\x00-\x7f]', r'', data)
data = ' '.join(data.split())

for x in tokenizer.tokenize(data):
    matches = tool.check(x)
    words = x.split(' ')
    if len(matches) == 0 and len(words) >= 8:
        lines.append(x)

print('\n'.join(lines))
f = open("./data/dbt_handouts_pass2.txt", "w")
f.write('\n'.join(tokenizer.tokenize(data)))
f.close()
Code Example #20
num=0
for fileName in list_of_files:
    s=[]
    p=[]
    r=[]
    t=[]
    q=[]
    a=[]
    fin=open(fileName,"r")
    data=fin.read()
    print(fileName,"check3")
    print(data)
    #inp=input("enter your opinion:")
    inp=input("press 1 for sports 2 for education 3 for entertainment 4 for politics 5 for business")
    print(inp)
    for line in data.split('\n'):
        #print(line)
        r.append(line)
        #print(type(s))
        s.append(word_tokenize(line))
        #print(word_tokenize(line))
    fin.close()  # closes the file once all of its lines have been processed
#print(s) 
    for i in range(len(s)):
        r.append(i)
#print(r)

    t = word_tokenize(str(r))
    print(t)
    p = nltk.corpus.stopwords.words('english')
    #print(p)
Code Example #21
                continue
        except:
            continue
        try:
            if (len(note.get_text().split())) == 1:
                footNote = note.attrs['n'] + ' ' + note.get_text()
                dic[note.attrs['n']] = note.get_text()
            if (len(note.get_text().split())) > 1:
                footNote = note.attrs['n'] + ' ' + note.get_text()
                for word in footNote.split():
                    #if word[:5] == 'https':
                    if 'htt' in word:
                        data = data + word + ' '
                    if word.isdigit():
                        data = data + word + ' '
                for multi in data.split():
                    if multi.isdigit():
                        key = multi
                        continue
                    dic[key] = multi
                    listXMLfile.append(filenameXML)
                    outputCSV_writer.writerow([filenameXML, dic[key]])
                    print(count)

        except:
            continue
    #print(dic)
    '''
	#// extracting the sentance 
	cleanSentence = None
	sentences = split_into_sentences(contents)
Code Example #22
File: syl.py  Project: c-forster/scansion
cmu = cmudict.dict()
# webster = loadWebster(webster)

################## open file ##################
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
fp = open("sample.txt")
# fp = open("collier.txt")
# fp = open("frost_woods.txt")
# fp = open("pope_windsor_forest.txt")
# fp = open("paradise_lost.txt")
# fp = open("sonnet_xv.txt")
# fp = open("thomson_seasons.txt")
data = fp.read()

data = data.split('\n') ## line breaking.

# exclude = set(string.punctuation)
exclude = set('!"#$%&()*+,./:;<=>?@[\\]^_`{|}~')
  # exclude all string.punctuation except apostrophe (I think?)
  # and hyphen (29 may 2013)
  # note that i remove all of string.punctuation at the end of
  # the replED function
lines = []  #to store the poem

for datum in data:
  '''
  This is really ugly, but, I needed to replace -'d endings
  with -ed endings, and the only place I could think of doing
  it was when first creating the lines.
  As this loop starts, apostrophes should be the only punctuation