Example No. 1
def parseTextToSentences(text):
    # treat common abbreviations as non-sentence-final so Punkt does not split after them
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'ms', 'mrs', 'prof', 'inc', 'no', 'e.g', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    data = text
    # detach closing quotes from sentence-final punctuation so the boundary is visible to the tokenizer
    data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')

    sentences = []
    # split into paragraphs first, then let Punkt split each paragraph into sentences
    for para in data.split('\n'):
        if para:
            sentences.extend(sentence_splitter.tokenize(para))
    return sentences
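A minimal usage sketch for the function above, assuming PunktParameters and PunktSentenceTokenizer have already been imported from nltk.tokenize.punkt as the function requires; the sample text is invented:

from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

sample = 'Dr. Watson met Mr. Holmes at the offices of Smith Inc. yesterday.\n"Was the case solved?" he asked. It was.'
for sentence in parseTextToSentences(sample):
    print(sentence)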
Example No. 2
def get_sentences(filename):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    with open(filename) as fp:
        data = fp.read()
    # str.replace returns a new string, so the result must be assigned back
    for prefix in prefixes:
        data = data.replace(prefix, prefix[:-1])
    sentences = tokenize.sent_tokenize(data)
    return sentences
Example No. 3
def tokenize(src_filename, new_filename):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    with open(src_filename, encoding="utf-8") as fp:
        data = fp.read()
    data = data.replace("\n"," ")
    with open(new_filename, "w", encoding="utf-8") as new_file:
        new_file.write("%s" % '\n'.join(tokenizer.tokenize(data)))
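A quick way to exercise the function above; the file names are placeholders, and the Punkt model must be available locally:

import nltk
nltk.download('punkt')  # makes 'tokenizers/punkt/english.pickle' loadable

# hypothetical file names; the input just needs to be a UTF-8 text file
tokenize('raw_text.txt', 'sentences.txt')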
Example No. 4
def parse_text_to_sentences(text):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'ms', 'mrs', 'prof', 'inc', 'no', 'e.g', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    data = text
    data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')

    sentences = []
    for para in data.split('\n'):
        if para:
            sentences.extend(sentence_splitter.tokenize(para))
    return sentences
Example No. 5
def splitIntoSentences2(file_name):
  punkt_param = PunktParameters()
  punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
  sentence_splitter = PunktSentenceTokenizer(punkt_param)
  with open(file_name) as fp:
    data = fp.read()
  data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')

  sentences = []
  for para in data.split('\n'):
    if para:
      sentences.extend(sentence_splitter.tokenize(para))
  # print '\n-----\n'.join(sentences)
  return sentences
Example No. 6
def clean_corpus(path, lower=True):
    '''
    Clean up a corpus by removing characters and expressions that 
    do not matter at the level of individual sentences
    
    path : str
        A string to a .txt file containing a corpus
        
    lower : bool
        Convert corpus to all lowercase
    
    '''
    filename_root = os.path.dirname(path)
    corpus_members = glob.glob(path)
    corpus = ''

    # get rid of random line breaks and exclude troublesome expressions like quotes
    for member in corpus_members:
        with open(member, "r") as openfile:
            data = openfile.read()
            # normalize full-width punctuation and Penn-Treebank-style quotes to ASCII
            data = data.replace('．', '.')
            data = data.replace('\'\'', '"')
            data = data.replace('``', '"')
            data = data.replace('`', '\'')
            data = data.replace('，', ',')
            data = data.replace('；', ';')
            data = data.replace('：', ':')
            for badchar in excluded_items1:
                data = data.replace(badchar, ' ')
            for badchar in excluded_items2:
                data = data.replace(badchar, '')
        corpus = corpus + ' ' + data

    if lower:
        corpus = corpus.lower()

    return corpus
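clean_corpus relies on two module-level lists, excluded_items1 and excluded_items2, that the snippet does not show. Below is a minimal sketch of how they might look and how the function would be called; the character sets and the path pattern are assumptions:

import glob
import os

# hypothetical definitions; the real lists live elsewhere in the source project
excluded_items1 = ['\n', '\t', '--']          # replaced by a space inside clean_corpus
excluded_items2 = ['"', '(', ')', '[', ']']   # removed outright inside clean_corpus

corpus = clean_corpus('data/books/*.txt')     # glob pattern is a placeholder
print(corpus[:200])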
Example No. 7
# maximum words contained in a sentence
max_length    = 100
# truncate from the end of the sequence, so content is lost at the end
trunc_type    = 'post'
# pad with zeros at the end of the sequence
padding_type  = 'post'
oov_tok       = "<OOV>"
# This dataset has 27000 records; we are going to use 20000 for training and the rest for testing
training_size = 20000

# ==================================================================================================================================================================
# Open json file and pre-process it
with open('Sarcasm_Headlines_Dataset.json','r') as f:
    data = f.read()

data = "[" + data.replace("}", "},", data.count("}")-1) + "]"
data_store = json.loads(data)

# print(data_store)
# data_store is a list of dictionaries

# ==================================================================================================================================================================
sentences = []
labels    = []

# each item is a dictionary, one per headline
for item in data_store:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])

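# A sketch of how the settings declared above are typically used to vectorize the headlines;
# the split at training_size follows the comment above, while the variable names and the use
# of Keras' Tokenizer/pad_sequences here are assumptions, not part of the original snippet.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

training_sentences = sentences[:training_size]
testing_sentences = sentences[training_size:]

tokenizer = Tokenizer(oov_token=oov_tok)       # out-of-vocabulary words map to "<OOV>"
tokenizer.fit_on_texts(training_sentences)     # build the word index on the training split only

training_padded = pad_sequences(tokenizer.texts_to_sequences(training_sentences),
                                maxlen=max_length, padding=padding_type, truncating=trunc_type)
testing_padded = pad_sequences(tokenizer.texts_to_sequences(testing_sentences),
                               maxlen=max_length, padding=padding_type, truncating=trunc_type)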
# ==================================================================================================================================================================
Example No. 8
    'other', 'we'
}
time = {
    'today', 'yesterday', 'evening', 'afternoon', 'morning', 'tomorrow',
    'tonight', 'tonite',
    'year', 'day', 'week', 'month', 'monday', 'tuesday', 'wednesday',
    'thursday', 'friday', 'saturday', 'sunday', 'Monday', 'Tuesday',
    'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'
}

repeat_question = 'y'
file = easygui.fileopenbox()
with open(file, 'r') as file:
    data = file.read()

data = data.replace('\n', '')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
data = tokenizer.tokenize(data)

subj = [[] for i in range(len(data))]
verb = [[] for i in range(len(data))]
obj = [[] for i in range(len(data))]
tell_time = [[] for i in range(len(data))]

for i in range(len(data)):

    #removing stopwords
    data[i] = [w for w in data[i].split() if w not in stop_words]

    #tagging the words
    sentence = data[i]
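The snippet is cut off at this point. Purely as an assumption about the "#tagging the words" step, a typical continuation would hand the stopword-filtered tokens to NLTK's part-of-speech tagger:

import nltk
nltk.download('averaged_perceptron_tagger')

# 'sentence' is the stopword-filtered token list built in the loop above
tagged = nltk.pos_tag(sentence)
print(tagged)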
Example No. 9
document_vector = []
doc_vec = [];
file_names ={}

j=0
total_sentences = 0
for tempfile in opt:
	fp = open(tempfile)
	file_names[tempfile] = j;
	datas = fp.read()
	i = 0
	paranum = 0
	tl = []
	for data in datas.splitlines():

		data = data.replace('\n','')
		data = data.replace('\t','')
		data = data.replace('\r','')
		sys.stdout.write( "\rProcessing para " + str(paranum))
		if len(data) <= 3:
			continue
		data = tokenizer.tokenize(data)

		for sen in data:
			#print "(" + str(i) + ")" + sen
			bog = removeStopwords(sen)
			if len(bog) < 2:
				continue
			tl.append(bog);
			sentence.append(sentenceRepresentation(bog,0,sen,tempfile,i,paranum))
			i = i + 1
Example No. 10
import nltk.data
import nltk
import language_check
import string
import re

tool = language_check.LanguageTool('en-US')
nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

lines = []
fp = open("./data/dbt_handouts_pass1.txt")
data = fp.read()
data = data.replace("\n", ' ').replace('•• ',
                                       '').replace('*',
                                                   '').replace('Other:', '')
data = re.sub(r'\d+', ' ', data)
data = re.sub(r'[^\x00-\x7f]', r'', data)
data = ' '.join(data.split())

for x in tokenizer.tokenize(data):
    matches = tool.check(x)
    words = x.split(' ')
    if len(matches) == 0 and len(words) >= 8:
        lines.append(x)

print('\n'.join(lines))
f = open("./data/dbt_handouts_pass2.txt", "w")
f.write('\n'.join(tokenizer.tokenize(data)))
f.close()
Example No. 11
def AMAZON_grabINFO(url):
    crumbs = []
    crumbed = False
    review = ""
    image_url = None
    # try:
    try:
        print "Requesting page..."
        page = urllib2.urlopen(url)
        soup = BeautifulSoup(page.read())
    except:
        return (None, None, None, None, None, False)

        # get images
    for item_IMAGE in soup.findAll("img", {"id": "landingImage"}):  # alt of imgBlkFront
        image_url = item_IMAGE["src"]
        print " New Link: " + str(image_url)
        # Product Review

    if image_url == None:
        print "Identified as none type."
        for item_IMAGE in soup.findAll("img", {"id": "imgBlkFront"}):  # alt of imgBlkFront
            image_url = item_IMAGE["src"]
            print " New Link: " + str(image_url)

    item_TITLE = soup.find("span", {"id": "productTitle"}, text=True)
    if item_TITLE == None:
        print "No Title!"
        item_TITLE = "None"
    else:
        text = "".join(item_TITLE.findAll(text=True))
        item_TITLE = unicode_check(text).strip()
    print "TITLE: " + item_TITLE

    # Find Product Info
    dat_ladder = soup.find(attrs={"class": "zg_hrsr_ladder"})
    if dat_ladder is None:
        print "No soup!"
        category = "none"
    else:
        get_rows = dat_ladder.findAll("a")
        print "Categories: "

        for hit in get_rows:
            text = "".join(hit.findAll(text=True))
            data = unicode_check(text).strip()
            category = data
            print category
            crumbs.append(data.replace("&", "and"))

            # print "Category: " + category
    review_soup = soup.find("div", attrs={"id": "revMHRL"})
    if review_soup == None:
        print "No Review Soup!"
        review_soup = ""
        review = ""
    else:
        # Scrape Review
        test = review_soup.findAll("a")
        review_url = url
        for reviews in test:
            if reviews.has_attr("href"):
                print "Reviews: " + reviews["href"]
                review_url = reviews["href"]
                if "review" in reviews["href"]:
                    break

                    # Get the Review
        pre_review = get_review(review_url)
        review = unicode_check(pre_review[:])
        print "Review: " + str(unicode_check(review))
        pre_review2 = split_sentences(review)
        print "Pre_Review2: " + str(pre_review2)

        review = ""

        # Make the Review
        for each in pre_review2:
            sentiment = ""
            if len(review) < 500:
                sentiment = get_setiment(each)
                print "Sentiment Detected: " + sentiment

                if ("Positive" == str(sentiment)) or ("Neutral" == str(sentiment)) or ("positive" == str(sentiment)):
                    print "Sentiment Found: " + sentiment
                    review2 = review + each.replace("&", "and") + " "
                    if len(review2) < 500:
                        review = review2
                        print "Added sentence: " + each

    print "Summarized: " + str(review)
    return (image_url, review, item_TITLE, category, crumbs, False)
Example No. 12
        )
    )
)

corpus = []
corpusOriginal = []
with open('eggs.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        data=row[2]
        text = " ".join(
            map(
                stemmer2.stem, filter(
                    fr_stop,
                    stupid_tokenizer(data.replace("é", "e").replace("è", "e").replace("â", "a")
                                     .replace("ê", "e").replace("ù","u").replace("û","u")
                                     .replace("ë","e").replace("ü","u").replace("à","a").replace(","," "))
                )
            )
        )
        corpusOriginal.append(data)
        corpus.append(text)
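# The long .replace() chain above only handles a hard-coded set of accented characters.
# As a generic alternative (a sketch, not part of the original snippet), the standard
# library can strip all combining accents in one pass:
import unicodedata

def strip_accents(s):
    # decompose accented characters (NFKD) and drop the combining marks
    return ''.join(c for c in unicodedata.normalize('NFKD', s)
                   if not unicodedata.combining(c))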
corpus_train, corpus_test = train_test_split( corpus, test_size=0.80, random_state=42)
corpusOriginal_train,corpusOriginal_test = train_test_split( corpusOriginal, test_size=0.80, random_state=42)
vectorizer = TfidfVectorizer(ngram_range=(1,1), max_features=2200)
X = vectorizer.fit_transform(corpus_train)
pca = PCA(n_components=100)
X_pca = pca.fit_transform(X.toarray())
svd = TruncatedSVD(n_components=50, n_iter=50, random_state=42)
Example No. 13
def makeLowestUnknown(data, unigramDf):
    # take the three lowest-probability unigrams (the tail of the descending sort) ...
    replaces = unigramDf.sort_values(by="Probability", ascending=False).Text.values[-3:]
    # ... and replace every occurrence of them in the text with "UNK"
    newData = data.replace(replaces[0], "UNK").replace(replaces[1], "UNK").replace(replaces[2], "UNK")
    return newData
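A toy call to the function above; the DataFrame contents are invented, and only the "Text" and "Probability" column names come from the function itself:

import pandas as pd

unigramDf = pd.DataFrame({
    "Text": ["the", "cat", "sat", "on", "zyzzyva"],
    "Probability": [0.5, 0.2, 0.15, 0.1, 0.05],
})
# the three lowest-probability unigrams ("sat", "on", "zyzzyva") become "UNK"
print(makeLowestUnknown("the cat sat on a zyzzyva", unigramDf))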
Example No. 14
document_vector = []
doc_vec = []
file_names = {}

j = 0
total_sentences = 0
for tempfile in opt:
    fp = open(tempfile)
    file_names[tempfile] = j
    datas = fp.read()
    i = 0
    paranum = 0
    tl = []
    for data in datas.splitlines():

        data = data.replace('\n', '')
        data = data.replace('\t', '')
        data = data.replace('\r', '')
        sys.stdout.write("\rProcessing para " + str(paranum))
        if len(data) <= 3:
            continue
        data = tokenizer.tokenize(data)

        for sen in data:
            #print "(" + str(i) + ")" + sen
            bog = removeStopwords(sen)
            if len(bog) < 2:
                continue
            tl.append(bog)
            sentence.append(
                sentenceRepresentation(bog, 0, sen, tempfile, i, paranum))