def plaintext_to_conll(inpath, postag=False, lemmatise=False, lang='en',
                       metadata=False, outpath=False, nltk_data_path=False,
                       speaker_segmentation=False):
    """
    Take a plaintext corpus and sent/word tokenise.

    :param inpath: The corpus to read in
    :param postag: do POS tagging?
    :param lemmatise: do lemmatisation?
    :param lang: choose language for pos/lemmatiser (not implemented yet)
    :param metadata: add metadata to conll (not implemented yet)
    :param outpath: custom name for the resulting corpus
    :param speaker_segmentation: does the corpus have speaker names?
    """
    import nltk
    import shutil
    import pandas as pd
    from corpkit.process import saferead
    from corpkit.build import get_filepaths

    fps = get_filepaths(inpath, 'txt')

    # IN THE SECTIONS BELOW, WE COULD ADD MULTILINGUAL
    # ANNOTATORS, PROVIDED THEY BEHAVE AS THE NLTK ONES DO

    # SENT TOKENISERS
    from nltk.tokenize.punkt import PunktSentenceTokenizer
    stoker = PunktSentenceTokenizer()
    s_tokers = {'en': stoker}
    sent_tokenizer = s_tokers.get(lang, stoker)

    # WORD TOKENISERS
    tokenisers = {'en': nltk.word_tokenize}
    tokeniser = tokenisers.get(lang, nltk.word_tokenize)

    # LEMMATISERS
    if lemmatise:
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr = WordNetLemmatizer()
        lemmatisers = {'en': lmtzr}
        lemmatiser = lemmatisers.get(lang, lmtzr)

    # POS TAGGERS
    if postag:
        # nltk.download('averaged_perceptron_tagger')
        postaggers = {'en': nltk.pos_tag}
        tagger = postaggers.get(lang, nltk.pos_tag)

    # iterate over files, make df of each, convert this
    # to conll and sent to new filename
    for f in fps:
        for_df = []
        data, enc = saferead(f)
        plain, enc = saferead(f.replace('-stripped', ''))
        #orig_data = data
        #data, offsets = process_meta(data, speaker_segmentation, metadata)
        #nest = []
        sents = sent_tokenizer.tokenize(data)
        soffs = sent_tokenizer.span_tokenize(data)
        toks = [tokeniser(sent) for sent in sents]
        ser = nested_list_to_pandas(toks)
        for_df.append(ser)
        if postag or lemmatise:
            postags = pos_tag_series(ser, tagger)
        if lemmatise:
            lemma = lemmatise_series(ser, postags, lemmatiser)
            for_df.append(lemma)
            for_df.append(postags)
        else:
            if postag:
                for_df.append(postags)
        df = pd.concat(for_df, axis=1)
        fo = new_fname(f, inpath)
        write_df_to_conll(df, fo, metadata=metadata, plain=plain,
                          stripped=data,
                          speaker_segmentation=speaker_segmentation,
                          offsets=soffs)
        nsent = len(set(df.index.labels[0]))
        print('%s created (%d sentences)' % (fo, nsent))

    if '-stripped' in inpath:
        return inpath.replace('-stripped', '-tokenised')
    else:
        return inpath + '-tokenised'
def sent_pos(in_dir):
    """ Positions of citation markers in sentences, relative to where in doc
    """
    arxiv_base_url = 'http://export.arxiv.org/api/query?search_query=id:'
    arxiv_ns = {
        'atom': 'http://www.w3.org/2005/Atom',
        'opensearch': 'http://a9.com/-/spec/opensearch/1.1/',
        'arxiv': 'http://arxiv.org/schemas/atom'
    }
    punkt_param = PunktParameters()
    abbreviation = ['al', 'fig', 'e.g', 'i.e', 'eq', 'cf']
    punkt_param.abbrev_types = set(abbreviation)
    tokenizer = PunktSentenceTokenizer(punkt_param)
    with open('hedge_words') as f:
        hedge_words = [l.strip() for l in f.readlines()]
    x_all = list(range(-5, 6))
    y_verb = []
    y_noun = []
    y_propnoun = []
    y_prepos = []
    y_adj = []
    y_wh = []
    y_adv = []
    y_pr = []
    y_form = []
    y_fig = []
    y_tab = []
    for x in x_all:
        y_verb.append(0)
        y_noun.append(0)
        y_propnoun.append(0)
        y_prepos.append(0)
        y_adj.append(0)
        y_wh.append(0)
        y_adv.append(0)
        y_pr.append(0)
        y_form.append(0)
        y_fig.append(0)
        y_tab.append(0)
    file_names = os.listdir(in_dir)
    for file_idx, fn in enumerate(file_names):
        if file_idx % 100 == 0:
            print('{}/{}'.format(file_idx, len(file_names)))
        path = os.path.join(in_dir, fn)
        aid, ext = os.path.splitext(fn)
        if ext != '.txt' or aid == 'log':
            continue
        phys_cat = [
            'hep-th', 'hep-ph', 'hep-lat', 'hep-ex', 'cond-mat', 'astro-ph',
            'physics', 'nucl', 'gr-qc', 'quant-ph', 'nlin'
        ]
        math_cat = ['math', 'math-ph']
        cs_cat = ['cs']
        if re.search(r'[a-z]', aid):
            split = re.search(r'[a-z][0-9]', aid).span()[0] + 1
            aid = aid[:split] + '/' + aid[split:]
        resp = requests.get('{}{}&start=0&max_results=1'.format(
            arxiv_base_url, aid))
        xml_root = etree.fromstring(resp.text.encode('utf-8'))
        result_elems = xml_root.xpath('/atom:feed/atom:entry',
                                      namespaces=arxiv_ns)
        result = result_elems[0]
        cat = result.find('arxiv:primary_category',
                          namespaces=arxiv_ns).get('term')
        high_cat = None
        for pc in phys_cat:
            if pc in cat:
                high_cat = 'phys'
                break
        if not high_cat:
            for mc in math_cat:
                # was `if pc in cat`, which re-tested the physics category
                if mc in cat:
                    high_cat = 'math'
                    break
        if not high_cat:
            if 'cs' in cat:
                high_cat = 'cs'
        if not high_cat:
            continue
        if high_cat != 'phys':
            continue
        with open(path) as f:
            text = f.read()
        marker = ' \u241F '
        doc_len = len(text)
        for sent_idx, sent_edx in tokenizer.span_tokenize(text):
            sentence_orig = text[sent_idx:sent_edx]
            sentence = re.sub(CITE_MULTI_PATT, marker, sentence_orig)
            sentence = re.sub(QUOTE_PATT, ' {}.'.format(marker), sentence)
            if marker in sentence:
                words = pos_tag(sentence.split())
                words = [w for w in words if re.search(r'[\w|\u241F]', w[0])]
                sent_len = len(words)
                indices = [
                    i for i, tup in enumerate(words)
                    if tup[0] == marker.strip()
                ]
                for word_idx in indices:
                    word = words[word_idx][0]
                    if word == marker.strip():
                        for shift in x_all:
                            x_idx = shift + 5
                            if shift == 0:
                                # marker itself
                                continue
                            if word_idx+shift < 0 or \
                                    word_idx+shift >= len(words):
                                # out of range
                                continue
                            wrd = words[word_idx + shift][0]
                            pos = words[word_idx + shift][1]
                            if 'V' in pos:
                                y_verb[x_idx] += 1
                            if pos in ['NN', 'NNS']:
                                y_noun[x_idx] += 1
                            if pos in ['NNP', 'NNPS']:
                                y_propnoun[x_idx] += 1
                            if pos == 'IN':
                                y_prepos[x_idx] += 1
                            if 'JJ' in pos:
                                y_adj[x_idx] += 1
                            if 'W' in pos:
                                y_wh[x_idx] += 1
                            if 'RB' in pos:
                                y_adv[x_idx] += 1
                            if 'PR' in pos:
                                y_pr[x_idx] += 1
                            if wrd == 'FORMULA':
                                y_form[x_idx] += 1
                            if wrd == 'FIGURE':
                                y_fig[x_idx] += 1
                            if wrd == 'TABLE':
                                y_tab[x_idx] += 1
        if file_idx > 200:
            break
    for idx, y in enumerate([(y_verb, 'verb'), (y_noun, 'noun'),
                             (y_propnoun, 'proper noun'),
                             (y_prepos, 'preposition'), (y_adj, 'adjective'),
                             (y_wh, 'wh-det./-adv./-pron.'), (y_adv, 'adverb'),
                             (y_pr, 'pers./pos. pronoun'),
                             (y_form, 'formula')]):
        color = list(mpl.rcParams['axes.prop_cycle'])[idx]['color']
        plt.plot(x_all, y[0], marker='', linestyle='-', linewidth=.5,
                 alpha=0.3, color=color)
        plt.plot(x_all, y[0], label=y[1], marker='D', linestyle='',
                 color=color)
    plt.xlabel('word position relative to citation')
    plt.ylabel('number of words')
    plt.legend()
    ax = plt.gca()
    ax.xaxis.grid(True)
    plt.xticks(np.arange(min(x_all), max(x_all), 1.0))
    plt.show()
train = False
if train:
    with gzip.open("en_corp", 'rt', encoding='utf-8') as encorp, gzip.open(
            "de_corp", 'rt', encoding='utf-8') as decorp:
        text_en = encorp.read()
        text_de = decorp.read()
    trainer_en = PunktTrainer()
    trainer_en.INCLUDE_ALL_COLLOCS = True
    trainer_en.train(text_en)
    trainer_de = PunktTrainer()
    trainer_de.INCLUDE_ALL_COLLOCS = True
    trainer_de.train(text_de)
    tokenizer_en = PunktSentenceTokenizer(trainer_en.get_params())
    tokenizer_de = PunktSentenceTokenizer(trainer_de.get_params())
else:
    #tokenizer_en=PunktSentenceTokenizer()
    #tokenizer_de=PunktSentenceTokenizer()
    #nltk.download('punkt')
    tokenizer_en = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer_de = nltk.data.load('tokenizers/punkt/german.pickle')

mismatch = 0
with gzip.open(sys.argv[1], 'rt', encoding='utf-8') as filtered:
    for line in filtered:
        tabs = line.split('\t')
        line_src = tabs[3]
        line_tgt = tabs[4]
        sent_src = tokenizer_en.tokenize(line_src)
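# A minimal sketch (not part of the original script, file names are
# hypothetical): once the PunktTrainer branch above has been run, the trained
# tokenizers can be persisted with pickle, so later runs can load a custom
# model instead of NLTK's shipped english/german pickles.
import pickle

def save_tokenizer(tokenizer, path):
    # Serialize a PunktSentenceTokenizer to disk.
    with open(path, 'wb') as f:
        pickle.dump(tokenizer, f)

def load_tokenizer(path):
    # Restore a previously trained PunktSentenceTokenizer.
    with open(path, 'rb') as f:
        return pickle.load(f)

# usage (hypothetical paths):
# save_tokenizer(tokenizer_en, 'punkt_en_custom.pickle')
# tokenizer_en = load_tokenizer('punkt_en_custom.pickle')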
def __init__(self):
    self.language = "latin"
    self.punkt_param = PunktParameters()
    self.punkt_param.abbrev_types = set(ABBREVIATIONS)
    self.sent_tokenizer = PunktSentenceTokenizer(self.punkt_param)
    self.word_tokenizer = LatinLanguageVars()
#edit this when changing dirs
LangPaths = os.path.realpath(
    "C:/users/rihanna/Documents/Pol/ThesisIt/SumMe/Summarizer/langdetector/profiles/")

tltagger = nltk.data.load("taggers/filipino_aubt.pickle")  #filipino pos tagger
tlChunker = nltk.data.load("chunkers/filipino_ub.pickle")  #filipino chunker here
enChunker = nltk.data.load("chunkers/conll2000_ub.pickle")  #enChunker here

punkt_param = PunktParameters()  #creates an opening for tokenizer parameters.
punkt_param.abbrev_types = set(['gng', 'mr', 'mrs', 'dr', 'rep'])  #abbreviations further accepted go here
sentence_splitter = PunktSentenceTokenizer(punkt_param)
tokenized = ""

gateway = JavaGateway()
detector = gateway.entry_point
detector.init(LangPaths)


def LangDetect(str):
    return detector.detect(str)


def tokenizer(str):
    #print(wordpunct_tokenize(str))
    return wordpunct_tokenize(str)
def tokenize_latin_words(string):
    """
    Tokenizer divides the string into a list of substrings

    >>> from cltk.corpus.utils.formatter import remove_non_ascii
    >>> text = 'Dices ἐστιν ἐμός pulchrum esse inimicos ulcisci.'
    >>> tokenize_latin_words(text)
    ['Dices', 'ἐστιν', 'ἐμός', 'pulchrum', 'esse', 'inimicos', 'ulcisci', '.']

    :param string: This accepts the string value that needs to be tokenized
    :returns: A list of substrings extracted from the string
    """
    from cltk.tokenize.latin_exceptions import latin_exceptions

    assert isinstance(string, str), "Incoming string must be type str."

    def matchcase(word):
        # From Python Cookbook
        def replace(m):
            text = m.group()
            if text.isupper():
                return word.upper()
            elif text.islower():
                return word.lower()
            elif text[0].isupper():
                return word.capitalize()
            else:
                return word
        return replace

    replacements = [(r'mecum', 'cum me'),
                    (r'tecum', 'cum te'),
                    (r'secum', 'cum se'),
                    (r'nobiscum', 'cum nobis'),
                    (r'vobiscum', 'cum vobis'),
                    (r'quocum', 'cum quo'),
                    (r'quacum', 'cum qua'),
                    (r'quicum', 'cum qui'),
                    (r'quibuscum', 'cum quibus'),
                    (r'sodes', 'si audes'),
                    (r'satin', 'satis ne'),
                    (r'scin', 'scis ne'),
                    (r'sultis', 'si vultis'),
                    (r'similist', 'similis est'),
                    (r'qualist', 'qualis est')]

    for replacement in replacements:
        string = re.sub(replacement[0], matchcase(replacement[1]), string,
                        flags=re.IGNORECASE)

    punkt_param = PunktParameters()
    abbreviations = ['c', 'l', 'm', 'p', 'q', 't', 'ti', 'sex', 'a', 'd',
                     'cn', 'sp', "m'", 'ser', 'ap', 'n', 'v', 'k', 'mam',
                     'post', 'f', 'oct', 'opet', 'paul', 'pro', 'sert',
                     'st', 'sta', 'v', 'vol', 'vop']
    punkt_param.abbrev_types = set(abbreviations)
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)

    word_tokenizer = PunktLanguageVars()
    sents = sent_tokenizer.tokenize(string)

    enclitics = ['que', 'n', 'ue', 've', 'st']
    exceptions = enclitics
    exceptions = list(set(exceptions + latin_exceptions))

    tokens = []

    for sent in sents:
        temp_tokens = word_tokenizer.word_tokenize(sent)
        # Need to check that tokens exist before handling them;
        # needed to make stream.readlines work in PlaintextCorpusReader
        if temp_tokens:
            if temp_tokens[0].endswith('ne'):
                if temp_tokens[0].lower() not in exceptions:
                    temp = [temp_tokens[0][:-2], '-ne']
                    temp_tokens = temp + temp_tokens[1:]

            if temp_tokens[-1].endswith('.'):
                final_word = temp_tokens[-1][:-1]
                del temp_tokens[-1]
                temp_tokens += [final_word, '.']

            for token in temp_tokens:
                tokens.append(token)

    # Break enclitic handling into own function?
    specific_tokens = []

    for token in tokens:
        is_enclitic = False
        if token.lower() not in exceptions:
            for enclitic in enclitics:
                if token.endswith(enclitic):
                    if enclitic == 'n':
                        specific_tokens += [token[:-len(enclitic)]] + ['-ne']
                    elif enclitic == 'st':
                        if token.endswith('ust'):
                            specific_tokens += [token[:-len(enclitic) + 1]] + ['est']
                        else:
                            specific_tokens += [token[:-len(enclitic)]] + ['est']
                    else:
                        specific_tokens += [token[:-len(enclitic)]] + ['-' + enclitic]
                    is_enclitic = True
                    break
        if not is_enclitic:
            specific_tokens.append(token)

    return specific_tokens
def __init__(self):
    self.name = "Gale Church Alignment Scorer"
    self.tokenizer = PunktSentenceTokenizer()
    self.sblocks, self.tblocks = [], []
def tokenize(document):
    doc_tokenizer = PunktSentenceTokenizer()
    sentences_list = doc_tokenizer.tokenize(document)
    return sentences_list
def main(sysargs):
    sys.argv = sysargs
    arg_parser = argparse.ArgumentParser(
        description='Formats debates by removing HTML and filtering words.')
    arg_parser.add_argument('-i', '--infile', required=True,
                            help='Debate file to format.')
    args = arg_parser.parse_args()

    # Initialize nltk elements.
    parser = SpeechHTMLParser()
    sent_splitter = PunktSentenceTokenizer()
    tokenizer = TreebankWordTokenizer()
    tagger_loc = '/het/users/jengi/stanford-postagger/'
    tagger = StanfordTagger(tagger_loc + 'models/wsj-0-18-bidirectional-distsim.tagger', \
                            tagger_loc + 'stanford-postagger.jar')
    stemmer = SnowballStemmer('english')

    # Read infile.
    speaker_pattern = re.compile('.*:')
    null_pattern = re.compile('\s*(\[[^\]]*\]|\([^\)]*\))')
    dash_pattern = re.compile('\S+(--)\s+')
    ellipse_pattern = re.compile('\s*\.\.\.\s*')
    noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']
    punct = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', \
             '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', \
             '\\', ']', '^', '_', '`', '{', '|', '}', '~']
    block_lengths = []
    with open(args.infile, 'r') as afile:
        file_contents = afile.read()
        parser.feed(file_contents)
        parser.close()

        num_blocks = 0
        speeches = {}
        for (speaker, block) in parser.text:
            if num_blocks % 10 == 0:
                print >> sys.stderr, 'Processing block ' + str(num_blocks) + ' ...'
            orig_block = block

            # Remove applause, laughter, etc.
            block = repeated_search(block, null_pattern, 0)

            # Remove -- from the end of words.  (Indicates stuttering / stopping.)
            block = repeated_search(block, dash_pattern, 1)

            # Do more complex tokenization.
            sents = sent_splitter.tokenize(block)
            sents = [ellipse_pattern.sub(' ... ', sent) for sent in sents]
            tokens = [tokenizer.tokenize(sent) for sent in sents]

            # Run POS tagger and keep only nouns.
            # Also lowercase and stem these nouns.
            tags = [tagger.tag(toks) for toks in tokens]
            tokens = []
            tagged_text = []
            for sent in tags:
                tokens.append([])
                for (word, tag) in sent:
                    tagged_text.append(word)
                    tagged_text.append(tag)
                    if tag in noun_tags:
                        tokens[len(tokens) - 1].append(stemmer.stem(word.lower()))

            # Remove any "sentences" that are actually empty and
            # any tokens that are pure punctuation.
            for i in reversed(range(len(tokens))):
                for j in reversed(range(len(tokens[i]))):
                    non_punct = ''.join([tok for tok in tokens[i][j] if tok not in punct])
                    if len(non_punct) == 0:
                        del tokens[i][j]
                if len(tokens[i]) == 0:
                    del tokens[i]

            # Make sure there is still at least one sentence left.
            num_sents = len(tokens)
            if num_sents == 0:
                continue

            # Add block to speeches dictionary.
            speaker = speaker[:speaker_pattern.match(speaker).end() - 1]
            if speaker not in speeches:
                speeches[speaker] = []
            speeches[speaker].append(orig_block)
            speeches[speaker].append(' '.join(tagged_text))
            speeches[speaker].append('\n'.join([' '.join(sent) for sent in tokens]))
            #print speeches[speaker][0]
            #print speeches[speaker][1]
            #print speeches[speaker][2]

            num_blocks += 1
            num_tokens = 0
            for toks in tokens:
                num_tokens += len(toks)
            block_lengths.append(num_tokens)

    # Save each speaker's text to a file.
    (infolder, basename) = os.path.split(os.path.abspath(args.infile))
    out_prefix = infolder + '/'
    out_suffix = basename
    for speaker in speeches:
        # Create outfile prefixed by speaker's name.
        outfile = open(out_prefix + speaker + '-' + out_suffix, 'w')

        # Save text to outfile.
        blocks = speeches[speaker]
        for i in range(0, len(blocks), 3):
            print >> outfile, blocks[i]
            print >> outfile, blocks[i + 1]
            print >> outfile, blocks[i + 2]
            print >> outfile

        outfile.close()

    print '# of blocks: ' + str(num_blocks)
    print 'Mean # of tokens (per block): ' + str(scipy.mean(block_lengths))
    print 'Median # of tokens: ' + str(scipy.median(block_lengths))
    print 'Standard deviation in # of tokens: ' + str(scipy.std(block_lengths))
def gather_input():
    #gather input
    for file in os.listdir("../scrapper/"):
        if file.endswith(".txt"):
            inputFile = file
    file = open("../scrapper/"+inputFile, "r")
    input = file.read()
    file.close()
    #os.remove("../scrapper/"+inputFile)

    #extract text
    reg_string = r"\"text\":\"(.+?)[^\\]\""
    data_array = re.findall(reg_string, input)

    #extract location of tweet
    reg_string = r"\"location\":\"(.*?)\""
    location_array = re.findall(reg_string, input)

    #extract whether retweeted or not
    reg_string = r"\"retweeted\":(.+?),"
    retweet_bool = re.findall(reg_string, input)

    #today's date in YYYYMMDD format
    date = datetime.datetime.now()
    date = date.date()
    #date = date.strftime("%Y%m%d")

    ## calculate the barrier date
    date_diff = int(sys.argv[1])
    DD = datetime.timedelta(days=date_diff)
    barrier_date = (datetime.datetime.now() - DD).date()

    ## load the whitelist and create array of arrays as - [noun,sentiment,count]
    file = open("../py_code/white_list.txt", "r")
    white_list = []
    line = file.readline()
    while line:
        white_list.append([line.rstrip(), 0, 0])
        line = file.readline()
    file.close()

    ## create a sentence_tokenizer
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc',
                                    '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                    '11', '12', '13', '14', '15', '16', '17', '18', '19', '20'])
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)

    ## next step is to inject into the database
    db = MySQLdb.connect(host="localhost", user="******", passwd="{2qGq(22+5iU", db="Insights")
    cur = db.cursor()

    ##filter out those tweets which have prices in them - usually sales, or retweets
    i = 0
    for text in data_array:
        if retweet_bool[i] != "false":
            pass
        else:
            ##filter text as many users don't put a space after the full stop - which is essential for the sentence tokenizer
            data_array[i] = re.sub(r'([\.\?\!])(\w)', r'\1 \2', data_array[i])
            blob = TextBlob(data_array[i])
            blob_sentiment = int(blob.sentiment.polarity*1000)/1000.0
            sql = "INSERT INTO Phrases(Phrase,Sentiment,Location,Date) VALUES (\""+data_array[i]+"\", "+str(blob_sentiment)+", \""+location_array[i]+"\", \""+str(date)+"\")"
            cur.execute(sql)

            ## tokenize the tweets, for sentiment analysis
            sentences = sent_tokenizer.tokenize(data_array[i])
            if len(sentences) == 1:
                ##run through the whiteList array, for each find count, add count, sentiment to array
                for word in white_list:
                    if (sentences[0].lower()).find(word[0]) != -1:
                        word[1] = word[1] + blob_sentiment
                        word[2] = word[2] + 1
            else:
                for sentence in sentences:
                    ##run through the whiteList array, for each find count and sentiment, add count, sentiment to array
                    for word in white_list:
                        if (sentence.lower()).find(word[0]) != -1:
                            blob = TextBlob(sentence)
                            word[1] = word[1] + int(blob.sentiment.polarity*1000)/1000.0
                            word[2] = word[2] + 1
        i = i + 1
    db.commit()

    ### now integrate these into Sentiment db, if there is no entry for today insert phrase and create one
    sql = "SELECT * FROM Sentiment WHERE `Date` ='"+str(date)+"' LIMIT 1;"
    cur.execute(sql)
    if cur.rowcount == 0:
        for word in white_list:
            if word[2] != 0:
                sql = "INSERT INTO Sentiment VALUES ('"+str(date)+"','"+word[0]+"','"+str(word[1])+"','"+str(word[2])+"');"
                cur.execute(sql)
    ### else get the entry in the table, add sentiment and count, store back
    else:
        for word in white_list:
            if word[2] != 0:
                sql = "SELECT Sentiment,Count FROM Sentiment WHERE `Date` ='"+str(date)+"'AND `Phrase`='"+word[0]+"';"
                cur.execute(sql)
                for row in cur.fetchall():
                    new_sentiment = float(row[0]) + word[1]
                    new_count = row[1] + word[2]
                    sql = "UPDATE Sentiment SET `Sentiment`="+str(new_sentiment)+",`Count`="+str(new_count)+" WHERE `Date` ='"+str(date)+"'AND `Phrase`='"+word[0]+"';"
                    cur.execute(sql)
    db.commit()

    ### now add all the sentiment and count for all phrases in the white list in the Sentiment db above the barrier_date,
    ### add to json those whose count is not zero
    total_sentiment = 0
    total_count = 0
    json_array = []
    for word in white_list:
        sql = "SELECT Sentiment,Count FROM Sentiment WHERE `Date` >'"+str(barrier_date)+"'AND `Phrase`='"+word[0]+"';"
        cur.execute(sql)
        if cur.rowcount != 0:
            for row in cur.fetchall():
                total_sentiment = total_sentiment + float(row[0])
                total_count = total_count + int(row[1])
            json_array.append({"noun": word[0],
                               "sentiment": int(total_sentiment/total_count*1000)/1000.0,
                               "count": total_count})
        total_sentiment = 0
        total_count = 0
    db.close()
    print(json.dumps(json_array))
def process_text_list(seg, text_list, new_json, zone):
    for text_part in text_list:
        if "text" in text_part:
            the_sentences = seg.segment(text_part["text"])
            sentences = []
            for the_span in the_sentences:
                span = {}
                span["start"] = the_span.start
                span["end"] = the_span.end
                sentences.append(span)

            # check if result is acceptable
            valid_segmentation = validate_segmentation(sentences)

            if not valid_segmentation:
                # fall back to NLTK
                sentences = []
                for start, end in PunktSentenceTokenizer().span_tokenize(text_part["text"]):
                    span = {}
                    span["start"] = start
                    span["end"] = end
                    sentences.append(span)

            offset_pos = 0
            # the following is to cancel a sentence segmentation because it is
            # located in the middle of an existing span
            # if previous_start is -1, previous segmentation was correct
            previous_start = -1
            for span in sentences:
                if previous_start != -1:
                    span["start"] = previous_start
                    previous_start = -1
                offset_pos = span["start"]

                sentence_structure = OrderedDict()
                sentence_structure["text"] = text_part["text"][span["start"]:span["end"]]

                if "section" in text_part:
                    sentence_structure["section"] = text_part["section"]
                if "paragraph_rank" in text_part:
                    sentence_structure["paragraph_rank"] = text_part["paragraph_rank"]
                if "section_rank" in text_part:
                    sentence_structure["section_rank"] = text_part["section_rank"]

                if "ref_spans" in text_part:
                    new_ref_spans = []
                    for ref_span in text_part["ref_spans"]:
                        # check if we have a segmentation in the middle of a ref span
                        if ref_span["start"] >= offset_pos and ref_span["start"] < span["end"] and ref_span["end"] > span["end"]:
                            """
                            print("\nwarning, segmentation in the middle of ref span: sentence at",
                                  span["start"], span["end"], "with ref at",
                                  ref_span["start"], ref_span["end"])
                            print("sentence:", text_part["text"][span["start"]:span["end"]])
                            print("ref:", text_part["text"][ref_span["start"]:ref_span["end"]])
                            print("\n")
                            """
                            # in this case, we cancel this sentence boundary
                            previous_start = span["start"]
                            break
                        if ref_span["start"] >= offset_pos and ref_span["end"] <= span["end"]:
                            new_ref_span = OrderedDict()
                            new_ref_span["start"] = ref_span["start"] - offset_pos
                            new_ref_span["end"] = ref_span["end"] - offset_pos
                            if "type" in ref_span:
                                new_ref_span["type"] = ref_span["type"]
                            if "ref_id" in ref_span:
                                new_ref_span["ref_id"] = ref_span["ref_id"]
                            if "text" in ref_span:
                                new_ref_span["text"] = ref_span["text"]
                            new_ref_spans.append(new_ref_span)
                    if len(new_ref_spans) > 0 and previous_start == -1:
                        sentence_structure["ref_spans"] = new_ref_spans

                if "entity_spans" in text_part and previous_start == -1:
                    new_entity_spans = []
                    for entity_span in text_part["entity_spans"]:
                        # check if we have a segmentation in the middle of an entity span
                        if entity_span["start"] >= offset_pos and entity_span["start"] < span["end"] and entity_span["end"] > span["end"]:
                            """
                            print("\nwarning, segmentation in the middle of entity span: sentence at",
                                  span["start"], span["end"], "with entity at",
                                  entity_span["start"], entity_span["end"])
                            print("sentence:", text_part["text"][span["start"]:span["end"]])
                            print("entity:", text_part["text"][entity_span["start"]:entity_span["end"]])
                            print("\n")
                            """
                            # in this case, we cancel this sentence boundary
                            previous_start = span["start"]
                            break
                        if entity_span["start"] >= offset_pos and entity_span["end"] <= span["end"]:
                            new_entity_span = OrderedDict()
                            new_entity_span["start"] = entity_span["start"] - offset_pos
                            new_entity_span["end"] = entity_span["end"] - offset_pos
                            if "type" in entity_span:
                                new_entity_span["type"] = entity_span["type"]
                            if "rawForm" in entity_span:
                                new_entity_span["rawForm"] = entity_span["rawForm"]
                            if "resp" in entity_span:
                                new_entity_span["resp"] = entity_span["resp"]
                            if "used" in entity_span:
                                new_entity_span["used"] = entity_span["used"]
                            if "id" in entity_span:
                                new_entity_span["id"] = entity_span["id"]
                            if "cert" in entity_span:
                                new_entity_span["cert"] = entity_span["cert"]
                            new_entity_spans.append(new_entity_span)
                    if len(new_entity_spans) > 0 and previous_start == -1:
                        sentence_structure["entity_spans"] = new_entity_spans

                if previous_start == -1:
                    new_json[zone].append(sentence_structure)
def _finalize_sent_tokenizer(self):
    """Re-instantiate sentence tokenizer to ensure it has updated params."""
    self._sent_tokenizer = PunktSentenceTokenizer(
        self._sent_trainer.get_params())
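# Hedged sketch of how the trainer/tokenizer pair above is typically wired up.
# The class name and the `feed`/`split` methods are assumptions for
# illustration; only `_finalize_sent_tokenizer` mirrors the original method.
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

class IncrementalSentenceSplitter:
    def __init__(self):
        self._sent_trainer = PunktTrainer()
        self._sent_tokenizer = PunktSentenceTokenizer()

    def feed(self, text):
        # Accumulate statistics from new raw text, then rebuild the tokenizer
        # so updated abbreviation/collocation parameters take effect.
        self._sent_trainer.train(text, finalize=False)
        self._finalize_sent_tokenizer()

    def _finalize_sent_tokenizer(self):
        self._sent_tokenizer = PunktSentenceTokenizer(
            self._sent_trainer.get_params())

    def split(self, text):
        return self._sent_tokenizer.tokenize(text)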
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
import networkx as nx
import pylab as plt

open_toi = open('/Users/aj/Documents/TOI_data3.txt', encoding='utf-8', mode='r+')
read_toi = open_toi.read()

#Tokenization
sen_token = PunktSentenceTokenizer()
tokens = sen_token.tokenize(read_toi)

#TF-IDF
matrix = CountVectorizer(stop_words=None).fit_transform(tokens)
print("transform matrix:\n")
print(matrix)
norm = TfidfTransformer().fit_transform(matrix)
print("normalized:\n", norm)
print("\n")
print("normalizer.T:\n", norm.T)
print("\n")

#similarity between sentences
similarity = norm * norm.T
print("similarity graph:\n")
print(similarity)
print("similarity Matrix:\n")
print(similarity.toarray())
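# Possible next step (illustrative, not part of the original script): treat
# the sentence-similarity matrix as a graph and rank sentences with PageRank,
# TextRank-style. Assumes a networkx version that still provides
# from_scipy_sparse_matrix (newer releases renamed it from_scipy_sparse_array).
graph = nx.from_scipy_sparse_matrix(similarity)
scores = nx.pagerank(graph)
# print the three highest-scoring sentences
top = sorted(((scores[i], s) for i, s in enumerate(tokens)), reverse=True)[:3]
for score, sentence in top:
    print(round(score, 4), sentence)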
def performSentenceSegmentation(file_content):
    #Training the model using given text: unsupervised learning
    tokenizer = PunktSentenceTokenizer()
    tokenizer.train(file_content)
    sentence_segmentation = tokenizer.tokenize(file_content)
    return sentence_segmentation
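# Illustrative usage of the function above (the sample text is invented):
# training Punkt on the document itself lets it pick up document-specific
# abbreviations in an unsupervised way before splitting.
if __name__ == '__main__':
    sample = ("Dr. Smith visited the lab. The results, cf. Fig. 2, "
              "were published later. Everyone agreed.")
    for s in performSentenceSegmentation(sample):
        print(s)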
def summarize(self):
    sents = []
    sentence_tags_dict = {}
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentence_tokenizer = PunktSentenceTokenizer()
    for document in self.documents:
        for sent in sent_detector.sentences_from_text(document):
            np_extractor = NPExtractor(sent)
            result = np_extractor.extract()
            index = len(sents)
            for tag in result:
                # print tag
                tag = tag.lower()
                # print tag
                if tag in sentence_tags_dict.keys():
                    value = sentence_tags_dict[tag]
                    # print "Found", tag, value
                    value.append(index)
                    sentence_tags_dict[tag] = value
                    # else:
                    #     print value
                else:
                    sentence_tags_dict[tag] = [index, ]
                    # print "Set", tag, sentence_tags_dict[tag]
            # print "This sentence is about: %s" % ", ".join(result)
            sents.append(sent)

    cv = CountVectorizer()
    bow_matrix = cv.fit_transform(sents)
    features = cv.get_feature_names()

    selected_sents = set()
    for feature in features:
        # print feature
        if feature in sentence_tags_dict.keys():
            # print "FOUND FEATURE"
            # print feature
            if sentence_tags_dict[feature]:
                for index in sentence_tags_dict[feature]:
                    selected_sents.add(sents[index])
        else:
            pass

    # print len(sents)
    # print len(selected_sents)
    # print "Documents", len(self.documents)
    # print "Cosine Similarity"
    # self.cosine_similarity(sents)
    # print "\n\nAll Sentences Summary\n\n"
    # self.generate_summary(sents)
    # print "\n\nSelected Sentences\n"
    summaries, removed_sentences = self.generate_summary(sents)
    # self.document_summaries(summaries)
    # print removed_sentences
    return self.document_summaries(summaries)
def tokenize_all(doc):
    # insert the check statement here
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(doc)
    return sentences
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
# from nltk.tokenize import PunktWordTokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize import WordPunctTokenizer

text = ("Are you curious about tokenization? " +
        "Let's see how it works! " +
        "We need to analyze a couple of " +
        "sentences with punctuations to see it in action.")

sent_tokenize_list = sent_tokenize(text)
print("Sentence tokenizer:")
print(sent_tokenize_list)

print("Word tokenizer:")
print(word_tokenize(text))

# Create a new punkt sentence tokenizer
punkt_sent_tokenizer = PunktSentenceTokenizer()
print("Punkt sentence tokenizer:")
print(punkt_sent_tokenizer.tokenize(text))

word_punct_tokenizer = WordPunctTokenizer()
print("Word punct tokenizer:")
print(word_punct_tokenizer.tokenize(text))
def _identify_keywords(self, key_terms=None):
    sent_tokenizer = PunktSentenceTokenizer()
    reflection_sentences = sent_tokenizer.tokenize(self.reflection)
    word_tokenizer = nltk.tokenize.word_tokenize
    for sentence in reflection_sentences:
        sent = word_tokenizer(sentence.lower())
        # keep (index, cleaned word) pairs; both branches below rely on this
        tokenized_sentence = [(x[0], util.clean_word(x[1])) for x in enumerate(sent)]
        if key_terms is None:
            relevant_data = Data.adjectives + Data.substantives
            result_obj = {'sentence': sentence, 'terms': []}
            terms_found = self._find_terms(tokenized_sentence, relevant_data)
            if terms_found:
                result_obj['terms'].append(terms_found)
                self.results.append(result_obj)
        else:
            for key_term in key_terms:
                # make an enumerable list, so I have the index of the split words
                # this handles the case where the same verb occurs more than once in the same sentence
                key_term_indexes = [(i, v) for i, v in tokenized_sentence
                                    if v == key_term['word']]
                if not key_term_indexes:
                    continue
                for key_term_index, t in key_term_indexes:
                    result_obj = {
                        'sentence': sentence,
                        'terms': [key_term]
                    }
                    # offset used to search for other elements around the verbs
                    offset = 5
                    # surr stands for surrounding
                    surr_words = [
                        (index, word) for index, word in
                        tokenized_sentence[max(0, key_term_index - offset):key_term_index + offset]
                    ]
                    cleaned_surr_words = [(index, util.clean_word(word))
                                          for index, word in surr_words]
                    for terms in self._find_terms(surr_words, Data.substantives, clean=False):
                        result_obj['terms'].append(terms)
                    for terms in self._find_terms(cleaned_surr_words, Data.adjectives):
                        result_obj['terms'].append(terms)
                    # if we haven't found any of the substantives or adjectives,
                    # we shouldn't look for adverbs
                    if [x for x in result_obj['terms']
                            if x['type'] != 'verb' and x['type'] != 'personal_pronoun']:
                        adverbs = Data.adverbs
                        result = self._find_terms(cleaned_surr_words, Data.adverbs)
                        for terms in result:
                            result_obj['terms'].append(terms)
                    self.results.append(result_obj)
def __init__(self):
    super(GCBlockExtractor, self).__init__(
        extraction_function=self._blocks_from_text)
    self.tokenizer = PunktSentenceTokenizer()
def annotate_text(raw_data_folder, labels_data_folder, file_to_write,
                  max_sent_len=35, improved_sent_splitting=True,
                  training=True):
    """
    Creates a token-level input file for the span identification task
    and adds sentence IDs to the tokens.
    """
    # max_sent_len = -1 ==> no sentence splitting
    if max_sent_len == -1:
        # the corresponding if-block can handle this
        improved_sent_splitting = True
    nlp = English()
    tokenizer = nlp.Defaults.create_tokenizer(nlp)
    if improved_sent_splitting:
        punkt_param = PunktParameters()
        punkt_param.abbrev_types = set([
            'dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'ms', 'rep', 'u.s',
            'feb', 'sen'
        ])
        splitter = PunktSentenceTokenizer(punkt_param)
        splitter.PUNCTUATION = tuple(';:,.!?"')
    output_table = []
    file_counter = 0
    sent_no_total = 0

    print("Total number of files - {}".format(len(
        os.listdir(raw_data_folder))))

    # Reading all the files from the raw text directory
    article_file_names = [
        file_name for file_name in os.listdir(raw_data_folder)
        if file_name.endswith(".txt")
    ]
    article_file_names.sort()

    for file_name in article_file_names:
        if training:
            label_file_name = file_name.replace(".txt", ".task2-TC.labels")
            print("raw_article: {}\tlabel_file: {}".format(
                file_name, label_file_name))

            # Read the labels file with 4 columns of format
            # doc_id : label_of_span : idx_span_begin : idx_span_end
            with open(os.path.join(labels_data_folder, label_file_name),
                      encoding="utf-8") as file:
                rows = file.readlines()
                rows = [
                    row.strip().split("\t") for row in rows
                    if len(row.split("\t")) == 4
                ]

                # Saving mappings char_idx->labels into the dictionary
                char_idx2label = dict()
                for row in rows:
                    label = row[1]
                    idx_from = int(row[2])
                    idx_to = int(row[3])
                    for idx in range(idx_from, idx_to):
                        if idx not in char_idx2label.keys():
                            char_idx2label[idx] = []
                        char_idx2label[idx].append(label)
        else:
            print("raw_article: " + file_name)

        # Read the article and process the text
        with open(os.path.join(raw_data_folder, file_name),
                  encoding="utf-8") as file:
            file_text = file.readlines()
            # Keep linebreaks for better sentence splitting
            file_text = ''.join([line for line in file_text])

        # Normalizing punctuation marks to help the tokenizer.
        file_text = file_text.replace('“', '"').replace('”', '"')
        file_text = file_text.replace("’", "'").replace("‘", "'")

        sentences = []
        if improved_sent_splitting:
            # Line breaks -> helps with headlines
            paragraphs = file_text.split('\n')
            for para in paragraphs:
                para = para.strip()
                sentences_raw = splitter.sentences_from_text(para)
                for sent in sentences_raw:
                    sent = sent.strip()
                    tokens = tokenizer(sent)
                    if len(tokens) <= max_sent_len or max_sent_len == -1:
                        # No need to split the sentence!
                        if len(sent) == 0:
                            # Can happen when paragraphs are separated by
                            # several line breaks.
                            continue
                        sentences.append(sent)
                        continue

                    # Try splitting based on quotes.
                    quote_fragments, all_ok = punct_based_split_sent(
                        tokenizer, sent, max_sent_len, '"')
                    if all_ok:
                        sentences += quote_fragments
                        continue

                    # Other punctuation for splitting: ; :
                    for quote_frag in quote_fragments:
                        semicolon_fragments, all_ok =\
                            punct_based_split_sent(tokenizer, quote_frag,
                                                   max_sent_len, ';')
                        if all_ok:
                            sentences += semicolon_fragments
                            continue

                        for semicolon_frag in semicolon_fragments:
                            colon_fragments, all_ok =\
                                punct_based_split_sent(tokenizer,
                                                       semicolon_frag,
                                                       max_sent_len, ':')
                            if all_ok:
                                sentences += colon_fragments
                                continue

                            # Commas:
                            for col_frag in colon_fragments:
                                comma_fragments, all_ok =\
                                    punct_based_split_sent(tokenizer,
                                                           col_frag,
                                                           max_sent_len, ',')
                                if all_ok:
                                    sentences += comma_fragments
                                    continue

                                # Last resort:
                                # Split after max_sent_len tokens
                                for comma_frag in comma_fragments:
                                    sentences += forcefully_split_sent(
                                        tokenizer, comma_frag, max_sent_len)
        else:
            # Cut long sentences into fragments that are (up to)
            # max_sent_len characters long
            # (the last fragment in a sentence might be shorter)
            file_text = file_text.replace('\n', ' ')
            sentences_raw = sent_tokenize(file_text)
            for sent in sentences_raw:
                sentences += forcefully_split_sent(tokenizer, sent,
                                                   max_sent_len)

        i = 0
        for sent in sentences:
            sent = sent.strip()
            i = file_text.find(sent, i)
            max_idx = i + len(sent)
            if sent == '':
                continue
            if improved_sent_splitting:
                if len(sent.strip()) < 2:  # single char noise
                    continue
            sent_no_total += 1

            for token in tokenizer(sent):
                token = str(token)
                token_idx = file_text.find(token, i, max_idx)
                i = token_idx + len(token)
                output = [
                    file_name.replace("article", "").replace(".txt", ""),
                    str(sent_no_total),
                    str(token_idx),
                    str(i), token
                ]
                if training:
                    # Check the label of the corresponding char_idx
                    label = char_idx2label.get(token_idx, ['None'])
                    output.append("|".join(label))
                output_table.append(output)

        file_counter += 1
        print("Finished {} files\n".format(file_counter))

    with open(file_to_write, 'w', encoding="utf-8") as f:
        f.write('# max_sent_len=' + str(max_sent_len) +
                ', improved_sent_splitting=' +
                str(improved_sent_splitting) + '\n')
        f.write('document_id\tsent_id\ttoken_start\ttoken_end\ttoken')
        if training:
            f.write('\tlabel')
        f.write('\n')
        for row in output_table:
            f.write('\t'.join(row) + "\n")
def ringkasan(self):
    array_text = []
    text = ' '.join(
        re.sub("(@[A-Za-z1-9]+)|(\w+:\/\/\S+)", " ", self.inputTeks).split())
    text = re.sub('<[^>]*>', '', text)  # remove punctuation/markup
    #text = re.sub("\d+", "", inputTeks)  # remove numbers
    emoticons = re.findall('(?::|;|=)()(?:-)?(?:\)|\(|D|P)', text)  # remove emoticons
    #text = (re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', ''))  # lowercase all letters
    array_text.append(text)
    for teks in array_text:
        document = teks
        doc_tokenizer = PunktSentenceTokenizer()
        sentences_list = doc_tokenizer.tokenize(document)
        cv = CountVectorizer()
        cv_matrix = cv.fit_transform(sentences_list)
        a = cv_matrix.toarray()
        normal_matrix = TfidfTransformer().fit_transform(cv_matrix)
        tfidf = normal_matrix.toarray()
        vektorkalimat = normal_matrix.toarray()
        A = vektorkalimat[0]
        B = vektorkalimat[2]
        dot = np.dot(A, B)
        norma = np.linalg.norm(A)
        normb = np.linalg.norm(B)
        cos = dot / (norma * normb)
        A = normal_matrix.T
        res_graph = normal_matrix * normal_matrix.T  # similarity / adjacency matrix
        G = res_graph.toarray()
        G = nx.from_numpy_matrix(np.matrix(G), create_using=nx.DiGraph)
        nx_graph = nx.from_scipy_sparse_matrix(res_graph)
        pageranks = nx.pagerank(nx_graph)
        sentence_array = sorted(
            ((pageranks[i], s) for i, s in enumerate(sentences_list)),
            reverse=True)
        sentence_array = np.asarray(sentence_array)
        rank_max = float(sentence_array[0][0])
        rank_min = float(sentence_array[len(sentence_array) - 1][0])
        temp_array = []

        # If all ranks are the same,
        # taking any sentence will give the summary, say the first sentence
        flag = 0
        if rank_max - rank_min == 0:
            temp_array.append(0)
            flag = 1

        # If the sentences have different ranks
        if flag != 1:
            for i in range(0, len(sentence_array)):
                temp_array.append((float(sentence_array[i][0]) - rank_min) /
                                  (rank_max - rank_min))
        print(len(temp_array))
        print(temp_array)
        print(sentence_array[4], [0])
        threshold = (sum(temp_array) / len(temp_array))
        sentence_list = []
        if len(temp_array) > 1:
            for i in range(0, len(temp_array)):
                if temp_array[i] > threshold:
                    sentence_list.append(sentence_array[i][1])
        else:
            sentence_list.append(sentence_array[0][1])
        summary = " ".join(str(x) for x in sentence_list)
        print(summary)

        # save the data in another file, named sum.txt
        if self.modesum == "web":
            namaf = 'ringkasan_url.txt'
        else:
            namaf = 'ringkasan_file.txt'
        f = open(namaf, 'w+')
        #print(type(f))
        f.write("\n")
        f.write(summary)
        f.close()
        self.akhir = summary
def tokenizeText2(text):
    #for abbreviations
    punkt_param = PunktParameters()
    abbreviation = abbrevations
    punkt_param.abbrev_types = set(abbreviation)
    tokenizer = PunktSentenceTokenizer(punkt_param)
    return [sent for sent in tokenizer.tokenize(text) if sent not in symbols]
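# Hedged usage sketch for tokenizeText2. The `abbrevations` and `symbols`
# globals are defined elsewhere in the module; the values below are
# illustrative stand-ins only.
#
#   abbrevations = ['dr', 'mr', 'no', 'vs']
#   symbols = ['.', '!', '?']
#   tokenizeText2("Dr. Kumar arrived at No. 10. He left early.")
#
# With 'dr' and 'no' registered as abbreviations, the periods after "Dr."
# and "No." are not treated as sentence boundaries, so the text is not
# over-split; bare punctuation sentences are filtered out via `symbols`.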
def __init__(self, text):
    self.text = text
    self.text = ' '.join(self.text.strip().split('\n'))
    self.sentence_splitter = PunktSentenceTokenizer()
    self.sentences = self.sentence_splitter.tokenize(text)
def get_sentence_spans(text: str):
    for start, end in PunktSentenceTokenizer().span_tokenize(text):
        yield start, end
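# Illustrative usage of the generator above (sample text is invented):
# spans index into the original string, so each sentence can be recovered
# by slicing.
if __name__ == '__main__':
    doc = "First sentence here. Second one follows."
    for start, end in get_sentence_spans(doc):
        print(start, end, repr(doc[start:end]))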
            total_correct += 1
            true_predictions.append(p)

    print(
        f'\n\n\nAcc: {total_correct / len(model_predictions):.7f} ' +
        f'F1: {f1_score(true_labels, model_predictions, average="macro"):.7f} ' +
        f'F1 by classes: {" ".join(str(f1) for f1 in f1_score(true_labels, model_predictions, average=None).tolist())}' +
        f'Total correct {total_correct} out of {len(model_predictions)}' +
        f'Correct by classes: {[true_predictions.count(c) for c in list(range(num_classes))]} /' +
        f'{[true_labels.count(c) for c in list(range(num_classes))]}\n'
    )


bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                               do_lower_case=True)
sentence_tokenizer = PunktSentenceTokenizer()


def bert_tokenize(text):
    sentences = sentence_tokenizer.tokenize(text)
    text = ''
    for sentence in sentences:
        text += sentence + ' [SEP] '
    tokens = []
    tokens.append("[CLS]")
    tokens += bert_tokenizer.tokenize(text)
    return tokens


if __name__ == '__main__':
    with open('config.json', 'r') as f:
def __init__(self):
    self.tokenizer = PunktSentenceTokenizer()
        return 'Low'
    elif x < 255 / 1000:
        return 'A1'
    elif x < 550 / 1000:
        return 'A2'
    elif x < 785 / 1000:
        return 'B1'
    else:
        return 'B2+'


class BulletPointLangVars(PunktLanguageVars):
    sent_end_chars = ('.', '?', '!', '•', '...', '|')


SENT_TOKENIZER = PunktSentenceTokenizer(lang_vars=BulletPointLangVars())

TEMPLATE_POSTER_URL = 'https://s.studiobinder.com/wp-content/uploads/2017/12/Movie-Poster-Template-Light-With-Image.jpg?x81279'

IMDB = pd.read_csv(processed_data_dir / 'movie_details_db.csv', dtype={'id': str})


class Movie():
    '''
    A movie in the Movielingo app

    Attributes:
    - title (str)
    - IMDB ID (str)
    - link to movie poster (str)
    - subtitle features (pandas df with NLP features)
    - subtitle difficulty distribution (list)
    - IMDB page (BeautifulSoup)
import os
import re
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
from nltk.corpus import gutenberg

# This file adds possible abbreviations to a txt file for nltk to learn in "process.py"
# Put in the path of the directory containing all txt files (crawled data)
directory = 'C:\\Users\\hyzha\\PycharmProjects\\NY Times World\\NY Times World'

text = ""
for file_id in gutenberg.fileids():
    text += gutenberg.raw(file_id)
trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(text)
tokenizer = PunktSentenceTokenizer(trainer.get_params())

abr = []
n = 1
for filename in os.listdir(directory):
    filepath = directory + '\\' + filename
    if 'text.txt' in filepath:
        try:
            with open('new.txt', 'w') as new:
                with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                    print(n)
                    n += 1
                    i = 0
                    for line in f:
                        i += 1
    'a', 'a.k.', 'pr', 'm', 'e', 'a.s', 'adv', 'akad', 'aklg', 'akt', 'al',
    'angl', 'apyg', 'aps', 'apskr', 'asist', 'asmv', 'avd', 'atsak', 'aut',
    'biol', 'b.k', 'bkl', 'bot', 'bt', 'buv', 'chem', 'd', 'dab', 'dail',
    'dek', 'dėst', 'dir', 'dirig', 'doc', 'dr', 'drp', 'dš', 'e.p', 'el.p',
    'egz', 'eil', 'ekon', 'el', 'e', 'etc', 'ež', 'fak', 'faks', 'filol',
    'filos', 'g', 'gyv', 'G', 'gen', 'geol', 'gerb', 'gim', 'gyd', 'gv',
    'įl', 'Įn', 'insp', 'inž', 'pan', 't.t', 'istor', 'k', 'Em.', 'k.a',
    'kand', 'kat', 'kg', 'kyš', 'kl', 'kln', 'kn', 'koresp', 'kpt', 'kr',
    'kt', 'kun', 'l.e.p', 'liet', 'ltn', 'mat', 'med', 'mėn', 'mgr', 'mgnt',
    'min', 'mjr', 'mln', 'mlrd', 'mok', 'mst', 'mstl', 'N', 'nkt', 'ntk',
    'nr', 'p', 'p.d', 'p.m.e', 'pav', 'pavad', 'pirm', 'pl', 'plg', 'plk',
    'pr.kr', 'proc', 'prof', 'prok', 'prot', 'pss', 'pšt', 'pvz', 'r', 'red',
    'rš', 'raj', 's', 'sąs', 'sav', 'saviv', 'sekr', 'sek', 'sen', 'sk',
    'skg', 'skyr', 'skv', 'sp', 'spec', 'sr', 'st', 'str', 'stud', 'š.m',
    'šnek', 'šv', 't', 't.y', 't.p', 'techn', 'tel', 'teol', 'tir', 'tūkst',
    'tūkstm', 'up', 'upl', 'V', 'vad', 'val', 'ved', 'vet', 'vnt', 'vrš',
    'vyr', 'vyresn', 'vs', 'Vt', 'vtv', 'vv', 'zool', 'žml', 'žr', 'ž.ū',
    'šmt'
]
punkt_param.abbrev_types = set(abbreviation)
tokenizer = PunktSentenceTokenizer(punkt_param)

with open(input_file) as ifile:
    with open(output_file, "w") as ofile:
        for i, line in tqdm(enumerate(ifile)):
            if line != "\n":
                # sent_list = nltk.tokenize.sent_tokenize(line)
                sent_list = tokenizer.tokenize(line)
                for sent in sent_list:
                    ofile.write(sent + "\n")
                ofile.write(doc_seperator)
from nltk import ne_chunk, pos_tag
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
'''
import nltk
nltk.download('words')
nltk.download('punkt')
nltk.download('maxent_treebank_pos_tagger')
nltk.download('maxent_ne_chunker')
'''

TreeBankTokenizer = TreebankWordTokenizer()
PunktTokenizer = PunktSentenceTokenizer()

text = '''
The Boston Celtics are a National Basketball Association (NBA) team based in Boston, MA.
They play in the Atlantic Division of the Eastern Conference. Founded in 1946, the team
is currently owned by Boston Basketball Partners LLC. The Celtics play their home games
at the TD Garden, which they share with the Boston Blazers (NLL), and the Boston Bruins
of the NHL. The Celtics have dominated the league during the late 50's and through the
mid 80's, with the help of many Hall of Famers which include Bill Russell, Bob Cousy,
John Havlicek, Larry Bird and legendary Celtics coach Red Auerbach, combined for a
795 - 397 record that helped the Celtics win 16 Championships.
'''

sentences = PunktTokenizer.tokenize(text)
tokens = [TreeBankTokenizer.tokenize(sentence) for sentence in sentences]
tagged = [pos_tag(token) for token in tokens]
chunked = [ne_chunk(taggedToken) for taggedToken in tagged]
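# Illustrative follow-up (not in the original script): pull the named
# entities out of the chunk trees produced above. `hasattr(subtree, 'label')`
# distinguishes NE subtrees from plain (word, tag) leaves.
for tree in chunked:
    for subtree in tree:
        if hasattr(subtree, 'label'):
            entity = ' '.join(word for word, tag in subtree.leaves())
            print(subtree.label(), '->', entity)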