from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer


def parseTextToSentences(text):
    # Treat common titles and Latin abbreviations as non-sentence-ending.
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'ms', 'mrs', 'prof', 'inc', 'no', 'e.g', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    # Separate closing quotes from the punctuation so Punkt splits after them.
    data = text
    data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
    sentences = []
    for para in data.split('\n'):
        if para:
            sentences.extend(sentence_splitter.tokenize(para))
    return sentences
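# A minimal usage sketch of parseTextToSentences (assumption: the import above
# is in place and NLTK's punkt data is installed); the sample text is made up.
sample = 'Mr. Smith met Dr. Jones. "Is it raining?" he asked. She said no.'
for s in parseTextToSentences(sample):
    print(s)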
def get_sentences(filename):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    with open(filename) as fp:
        data = fp.read()
    # str.replace returns a new string, so the result must be reassigned;
    # `prefixes` is assumed to be defined elsewhere in the module.
    for prefix in prefixes:
        data = data.replace(prefix, prefix[:-1])
    sentences = tokenize.sent_tokenize(data)
    return sentences
def tokenize(src_filename, new_filename):
    # Split the source file into sentences and write one sentence per line.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    with open(src_filename, encoding="utf-8") as fp:
        data = fp.read()
    data = data.replace("\n", " ")
    with open(new_filename, "w", encoding="utf-8") as new_file:
        new_file.write('\n'.join(tokenizer.tokenize(data)))
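# Hypothetical call to the tokenize() helper above; both file names are
# placeholders for a real input file and the sentence-per-line output file.
tokenize("raw_article.txt", "article_sentences.txt")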
def parse_text_to_sentences(text):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'ms', 'mrs', 'prof', 'inc', 'no', 'e.g', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    data = text
    data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
    sentences = []
    for para in data.split('\n'):
        if para:
            sentences.extend(sentence_splitter.tokenize(para))
    return sentences
def splitIntoSentences2(file_name):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    with open(file_name) as fp:
        data = fp.read()
    # Separate closing quotes from the punctuation so Punkt splits after them.
    data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
    sentences = []
    for para in data.split('\n'):
        if para:
            sentences.extend(sentence_splitter.tokenize(para))
    # print '\n-----\n'.join(sentences)
    return sentences
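# A minimal usage sketch; "essay.txt" is a placeholder for an existing
# plain-text file.
for sent in splitIntoSentences2("essay.txt"):
    print(sent)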
def clean_corpus(path, lower=True):
    '''
    Clean up a corpus by removing characters and expressions that do not
    matter at the level of individual sentences.

    path : str
        Path (or glob pattern) to the .txt file(s) containing the corpus
    lower : bool
        Convert the corpus to all lowercase
    '''
    filename_root = os.path.dirname(path)
    corpus_members = glob.glob(path)
    corpus = ''
    # get rid of random line breaks and exclude troublesome expressions like quotes
    for member in corpus_members:
        with open(member, "r") as openfile:
            data = openfile.read()
        data = data.replace('.', '.')
        data = data.replace('\'\'', '"')
        data = data.replace('``', '"')
        data = data.replace('`', '\'')
        data = data.replace(',', ',')
        data = data.replace(';', ';')
        data = data.replace(':', ':')
        # characters in excluded_items1 become spaces, those in excluded_items2
        # are removed outright (both lists are expected to be defined elsewhere)
        for badchar in excluded_items1:
            data = data.replace(badchar, ' ')
        for badchar in excluded_items2:
            data = data.replace(badchar, '')
        corpus = corpus + ' ' + data
    if lower:
        corpus = corpus.lower()
    return corpus
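# Sketch of how clean_corpus might be invoked. The glob pattern and the
# excluded_items1/excluded_items2 lists below are assumptions: the function
# expects them to exist at module level (replaced by a space vs. removed).
excluded_items1 = ['\n', '\t', '--']
excluded_items2 = ['"', '(', ')', '[', ']']
corpus = clean_corpus('corpus/*.txt', lower=True)
print(corpus[:200])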
# maximum number of words in a sentence
max_length = 100
# truncate sequences from the end when they are too long
trunc_type = 'post'
# pad with zeros at the end
padding_type = 'post'
oov_tok = "<OOV>"
# The dataset has 27,000 records; 20,000 are used for training and the rest for testing
training_size = 20000

# Open the JSON file and pre-process it: the file holds one JSON object per
# line, so insert commas between the objects and wrap everything in brackets
# to form a valid JSON array.
with open('Sarcasm_Headlines_Dataset.json', 'r') as f:
    data = f.read()
data = "[" + data.replace("}", "},", data.count("}") - 1) + "]"
data_store = json.loads(data)
# print(data_store)
# data_store is a list of dictionaries

sentences = []
labels = []
# each item is a dictionary with 'headline' and 'is_sarcastic' keys
for item in data_store:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
    'other', 'we'
}
time = {
    'today', 'yesterday', 'evening', 'afternoon', 'morning', 'tomorrow',
    'tonight', 'tonite', 'year', 'day', 'week', 'month',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'
}
repeat_question = 'y'
file = easygui.fileopenbox()
with open(file, 'r') as fh:
    data = fh.read()
data = data.replace('\n', '')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
data = tokenizer.tokenize(data)
subj = [[] for i in range(len(data))]
verb = [[] for i in range(len(data))]
obj = [[] for i in range(len(data))]
tell_time = [[] for i in range(len(data))]
for i in range(len(data)):
    # removing stopwords
    data[i] = [w for w in data[i].split() if w not in stop_words]
    # tagging the words
    sentence = data[i]
document_vector = []
doc_vec = []
file_names = {}
j = 0
total_sentences = 0
for tempfile in opt:
    fp = open(tempfile)
    file_names[tempfile] = j
    datas = fp.read()
    i = 0
    paranum = 0
    tl = []
    for data in datas.splitlines():
        data = data.replace('\n', '')
        data = data.replace('\t', '')
        data = data.replace('\r', '')
        sys.stdout.write("\rProcessing para " + str(paranum))
        if len(data) <= 3:
            continue
        data = tokenizer.tokenize(data)
        for sen in data:
            # print "(" + str(i) + ")" + sen
            bog = removeStopwords(sen)
            if len(bog) < 2:
                continue
            tl.append(bog)
            sentence.append(
                sentenceRepresentation(bog, 0, sen, tempfile, i, paranum))
            i = i + 1
import nltk.data
import nltk
import language_check
import string
import re

tool = language_check.LanguageTool('en-US')
nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

lines = []
with open("./data/dbt_handouts_pass1.txt") as fp:
    data = fp.read()
# Strip bullets, asterisks, form labels, digits, and non-ASCII characters,
# then collapse runs of whitespace.
data = data.replace("\n", ' ').replace('•• ', '').replace('*', '').replace('Other:', '')
data = re.sub(r'\d+', ' ', data)
data = re.sub(r'[^\x00-\x7f]', r'', data)
data = ' '.join(data.split())

# Keep only sentences of at least 8 words with no grammar issues.
for x in tokenizer.tokenize(data):
    matches = tool.check(x)
    words = x.split(' ')
    if len(matches) == 0 and len(words) >= 8:
        lines.append(x)
print('\n'.join(lines))

# Write every tokenized sentence (one per line) to the second-pass file.
with open("./data/dbt_handouts_pass2.txt", "w") as f:
    f.write('\n'.join(tokenizer.tokenize(data)))
def AMAZON_grabINFO(url):
    crumbs = []
    crumbed = False
    review = ""
    image_url = None
    # try:
    try:
        print "Requesting page..."
        page = urllib2.urlopen(url)
        soup = BeautifulSoup(page.read())
    except:
        return (None, None, None, None, None, False)
    # get images
    for item_IMAGE in soup.findAll("img", {"id": "landingImage"}):
        # alt of imgBlkFront
        image_url = item_IMAGE["src"]
        print " New Link: " + str(image_url)
    # Product Review
    if image_url == None:
        print "Identified as none type."
        for item_IMAGE in soup.findAll("img", {"id": "imgBlkFront"}):
            # alt of imgBlkFront
            image_url = item_IMAGE["src"]
            print " New Link: " + str(image_url)
    item_TITLE = soup.find("span", {"id": "productTitle"}, text=True)
    if item_TITLE == None:
        print "No Title!"
        item_TITLE = "None"
    else:
        text = "".join(item_TITLE.findAll(text=True))
        item_TITLE = unicode_check(text).strip()
        print "TITLE: " + item_TITLE
    # Find Product Info
    dat_ladder = soup.find(attrs={"class": "zg_hrsr_ladder"})
    if dat_ladder is None:
        print "No soup!"
        category = "none"
    else:
        get_rows = dat_ladder.findAll("a")
        print "Categories: "
        for hit in get_rows:
            text = "".join(hit.findAll(text=True))
            data = unicode_check(text).strip()
            category = data
            print category
            crumbs.append(data.replace("&", "and"))
    # print "Category: " + category
    review_soup = soup.find("div", attrs={"id": "revMHRL"})
    if review_soup == None:
        print "No Review Soup!"
        review_soup = ""
        review = ""
    else:
        # Scrape Review
        test = review_soup.findAll("a")
        review_url = url
        for reviews in test:
            if reviews.has_attr("href"):
                print "Reviews: " + reviews["href"]
                review_url = reviews["href"]
                if "review" in reviews["href"]:
                    break
        # Get the Review
        pre_review = get_review(review_url)
        review = unicode_check(pre_review[:])
        print "Review: " + str(unicode_check(review))
        pre_review2 = split_sentences(review)
        print "Pre_Review2: " + str(pre_review2)
        review = ""
        # Make the Review
        for each in pre_review2:
            sentiment = ""
            if len(review) < 500:
                sentiment = get_setiment(each)
                print "Sentiment Detected: " + sentiment
                if ("Positive" == str(sentiment)) or ("Neutral" == str(sentiment)) or ("positive" == str(sentiment)):
                    print "Sentiment Found: " + sentiment
                    review2 = review + each.replace("&", "and") + " "
                    if len(review2) < 500:
                        review = review2
                        print "Added sentence: " + each
    print "Summarized: " + str(review)
    return (image_url, review, item_TITLE, category, crumbs, False)
corpus = []
corpusOriginal = []
with open('eggs.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        data = row[2]
        # Normalize French accented characters, filter tokens through fr_stop,
        # then stem each remaining token.
        text = " ".join(
            map(
                stemmer2.stem,
                filter(
                    fr_stop,
                    stupid_tokenizer(
                        data.replace("é", "e").replace("è", "e").replace("â", "a")
                            .replace("ê", "e").replace("ù", "u").replace("û", "u")
                            .replace("ë", "e").replace("ü", "u").replace("à", "a")
                            .replace(",", " ")
                    )
                )
            )
        )
        corpusOriginal.append(data)
        corpus.append(text)

corpus_train, corpus_test = train_test_split(
    corpus, test_size=0.80, random_state=42)
corpusOriginal_train, corpusOriginal_test = train_test_split(
    corpusOriginal, test_size=0.80, random_state=42)

vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=2200)
X = vectorizer.fit_transform(corpus_train)
pca = PCA(n_components=100)
X_pca = pca.fit_transform(X.toarray())
svd = TruncatedSVD(n_components=50, n_iter=50, random_state=42)
def makeLowestUnknown(data, unigramDf):
    # Replace the three lowest-probability unigrams in the text with "UNK".
    replaces = unigramDf.sort_values(by="Probability", ascending=False).Text.values[-3:]
    newData = data.replace(replaces[0], "UNK").replace(replaces[1], "UNK").replace(replaces[2], "UNK")
    return newData
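# A made-up example of calling makeLowestUnknown: the DataFrame is assumed to
# have "Text" and "Probability" columns, and the three rarest unigrams get
# mapped to "UNK".
import pandas as pd

unigramDf = pd.DataFrame({
    "Text": ["the", "cat", "sat", "zyzzyva", "qubit"],
    "Probability": [0.40, 0.25, 0.20, 0.10, 0.05],
})
print(makeLowestUnknown("the cat sat near a zyzzyva and a qubit", unigramDf))
# -> "the cat UNK near a UNK and a UNK"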