def preprocess(text):
    # Normalize the raw tweet, strip punctuation and emoticons,
    # collapse runs of repeated characters, then stem the remaining words.
    text = feature_reduction.process_tweet(text)
    text = feature_reduction.remove_punctuation(text)
    text = feature_reduction.remove_emoticon(text)
    text = feature_reduction.replaceTwoOrMore(text)
    text = preprocessing.stemming(text)
    return text
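A hypothetical usage sketch, assuming the project's feature_reduction and preprocessing modules are importable:

raw_tweet = "loooove this sooo much!!!"
cleaned = preprocess(raw_tweet)  # punctuation/emoticons stripped, repeats collapsed, words stemmed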
Example 2
def toMerge(term, term_0):
    term = term.split(' ')
    term_0 = term_0.split(' ')
    if len(term) == 1 and len(term_0) == 1:
        # Unigram terms: mergeable if they share the same word stem.
        if pre.stemming(term[0]) == pre.stemming(term_0[0]):
            return 'same_stem'
        return False
    elif contain(term, term_0):  # term contains term_0
        return 'contain'
    elif contain(term_0, term):  # term is contained by term_0
        return 'be_contained'
    # term intersects term_0: at least half of the longer term is the same.
    overlap = intersect(term, term_0)
    if overlap:
        return overlap
    return False
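The contain and intersect helpers are not shown on this page; as a purely hypothetical sketch, contain could be read as a word-subset test:

def contain(term, term_0):
    # Hypothetical helper: True if every word of term_0 also appears in term.
    return all(w in term for w in term_0)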
Example 3
from urllib.request import urlopen
from urllib.parse import urlparse, urljoin

from bs4 import BeautifulSoup

# page_content, word_count and crawler_tuple are module-level dicts
# defined elsewhere in this project.


def scrape(visited, vocab):
    i = 0
    for url in visited:
        full_url = "http://" + url
        try:
            response = urlopen(full_url)
            print("scraping", i)
            i += 1
        except Exception:
            continue
        # Host of the current page; used to keep only same-site links.
        base = urlparse(full_url).netloc
        bs = BeautifulSoup(response, 'html.parser')
        try:
            title = bs.find('title').text
        except AttributeError:
            continue
        # Gather visible text: start from the title, then append the text
        # of the common content-bearing tags.
        content = title if title else ''
        tags = ['p', 'span', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div']
        for tag in tags:
            content += ' '.join(x.text for x in bs.find_all(tag))

        page_content[url] = {'data': content}
        content = content.replace('\n', ' ')

        tokens = process.tokenizer_fun(content)
        cleaned = process.remove_stopwords(tokens)
        stemmed = process.stemming(cleaned)
        cleaned2 = process.remove_stopwords(stemmed)
        cleaned_text = process.length2(cleaned2)

        # Per-page term frequencies; vocab tracks document frequency, so
        # each distinct token increments it once per page (the original
        # v_flag logic incremented at most one token per page).
        word_count[url] = {}
        seen_on_page = set()
        for token in cleaned_text:
            if token not in seen_on_page:
                vocab[token] = vocab.get(token, 0) + 1
                seen_on_page.add(token)
            word_count[url][token] = word_count[url].get(token, 0) + 1

        # Resolve relative hrefs against the full URL, keep same-host links.
        links = [urljoin(full_url, a.get('href'))
                 for a in bs.find_all('a') if a.get('href')]
        links = [l.rstrip("/") for l in links if urlparse(l).netloc == base]
        crawler_tuple[url] = (url, cleaned_text, list(set(links)))

    return crawler_tuple
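A hypothetical driver, assuming the process module and the module-level dicts come from the surrounding project:

page_content, word_count, crawler_tuple = {}, {}, {}
vocab = {}
result = scrape(["example.com"], vocab)  # maps each url to (url, tokens, same-host links)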
Example 4
def toMerge(name, name_0):
    term = name.split(' ')
    term_0 = name_0.split(' ')
    if len(term) == 1 and len(term_0) == 1:
        # Unigram terms: mergeable if they share the same word stem.
        if pre.stemming(term[0]) == pre.stemming(term_0[0]):
            # print('>> MAY MERGE %s AND %s DUE TO SAME WORD-STEM' % (name, name_0))
            return name
        return False
    elif contain(term, term_0):  # term contains term_0
        # print('>> MAY MERGE %s AND %s DUE TO INCLUSION' % (name, name_0))
        return name
    elif contain(term_0, term):  # term is contained by term_0
        # print('>> MAY MERGE %s AND %s DUE TO INCLUSION' % (name, name_0))
        return name_0
    # term intersects term_0: at least half of the longer term is the same.
    overlap = intersect(term, term_0)
    if overlap:
        # print('>> MAY MERGE %s AND %s DUE TO INTERSECTION' % (name, name_0))
        return overlap
    return False
Example 5
def step1a(data):
    # q is a module-level list; for each row it collects the words affected
    # by negation, lowercased, deduplicated, lemmatized and stemmed.
    for i in range(data.shape[0]):
        neg = data.iloc[i, 3]

        if len(neg) == 0:
            q.append([])
            continue
        list_effected_words = []
        for neg_item in neg:
            for w in neg_item['effectedWords']:
                list_effected_words.extend(w['words'].split(' '))
        list_effected_words = [w.lower() for w in list_effected_words]
        list_effected_words = list(set(list_effected_words))
        list_effected_words = pp.lemmatization(list_effected_words)
        list_effected_words = pp.stemming(list_effected_words)
        q.append(list_effected_words)
Example 6
def step1(data):
    # Append '_NEG' to each negated word found in the sentence column.
    for i in range(data.shape[0]):

        sen = data.iloc[i, 1]
        neg = data.iloc[i, 3]

        if len(neg) == 0:
            continue
        elif neg[0]['negex'].find(' no ') != -1:
            list_neg = neg[0]['negex'].split(' ')[1:]
            list_neg = pp.lemmatization(list_neg)
            list_neg = pp.stemming(list_neg)
            for n in list_neg:
                index = sen.find(n)
                if index == -1:
                    raise ValueError('negated word %r not found in sentence' % n)
                # Keep sen in sync so later words are tagged in the updated
                # sentence rather than the stale original.
                sen = sen[:index + len(n)] + '_NEG' + sen[index + len(n):]
                data.iloc[i, 1] = sen
                print('-' * 10)
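A self-contained illustration of the '_NEG' tagging performed above:

sen = "no improvement was observed"
n = "improvement"
idx = sen.find(n)
print(sen[:idx + len(n)] + '_NEG' + sen[idx + len(n):])
# no improvement_NEG was observed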
Example 7
def test_stemming(self):
    self.assertEqual(prep.stemming("playing running sleeping"), "play run sleep")
Example 8
def process_query(query):
    cleaned_query = pro.tokenizer_fun(query)
    cleaned_query = pro.remove_stopwords(cleaned_query)
    cleaned_query = pro.stemming(cleaned_query)
    return cleaned_query
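A hypothetical usage sketch, assuming the pro processing module is importable:

terms = process_query("running shoes for marathons")  # tokenized, stopword-free, stemmed terms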
Example 9
# Read File
sentencesAlign, goldAnnotation = readFile.dataAlign(fileName)

# Preprocessing the gold annotation
goldAnnotation = preprocessing.deleteNull(goldAnnotation)
listAnnotation = preprocessing.cleanGoldAnno(goldAnnotation)

for i in range(len(sentencesAlign)):
    sentence1 = sentencesAlign[i][0]
    sentence2 = sentencesAlign[i][1]

    # Preprocessing
    sentenceToken1 = preprocessing.tokenize(sentence1)
    sentenceToken2 = preprocessing.tokenize(sentence2)

    sentenceLemma1 = preprocessing.stemming(sentenceToken1)
    sentenceLemma2 = preprocessing.stemming(sentenceToken2)

    # Aligner
    cekidentical = aligner.alignIdenticalWords(sentenceLemma1, sentenceLemma2)
    cekSimilar = aligner.alignWordSimilarity(sentenceLemma1, sentenceLemma2,
                                             ppdbDict)
    cekSequences = aligner.alignSequences(sentenceLemma1, sentenceLemma2)
    cekneighbor = aligner.alignTextContext(sentenceLemma1, sentenceLemma2,
                                           ppdbDict)
    cekstop = aligner.alignStop(sentenceLemma1, sentenceLemma2, ppdbDict)
    cekSimilarex = aligner.alignWordSimilarityex(sentenceLemma1,
                                                 sentenceLemma2, ppdbDictex)
    # cekDep = aligner.alignDepContext(sentenceLemma1, sentenceLemma2)

    # Aligner result - features that will be fed into the final alignment output
Example 10
import numpy as np
import os
import pickle

# Project-local modules used below.
import config
import utils
import preprocessing
import doc2vec

if __name__ == "__main__":

    args = config.get_args()

    # make directory
    utils.make_directory_doc(args)

    # load dataset
    abstract, label = preprocessing.load_dataset(args)

    # Stemming step: uses the Snowball stemmer from the NLTK package.
    abstract = preprocessing.stemming(abstract)

    # convert word text to idx.
    sequences, word2idx, vocab_size, instances = preprocessing.get_sequences(
        abstract, args)

    # get context words, target word and document idx
    context, target, document = preprocessing.get_trainable_data(
        sequences, instances, args)

    num_document = np.max(document) + 1

    # model load and compile
    model = doc2vec.load_model(args, vocab_size, args.embedding_size,
                               num_document)
Example 11
# Take the 2D array of text and classes from the original data
readOriginal = openFile("komentar.csv")
teksKelas = teksDanKelas(readOriginal)  # renamed so the result no longer shadows the function

text = teksKelas[0]
kelasOrigin = teksKelas[1]
kelasOriginint = list(map(int, kelasOrigin))


textToken = pre.Tokensisasi(text)

# openTeks and stop are defined earlier in this script (not shown here)
preprocessing = pre.Tokensisasi(openTeks)
lower = pre.lowerCase(preprocessing)
filtering = pre.filtering(lower, stop)
stemming = pre.stemming(filtering)
term = pre.term(stemming)
count_term = len(term)

raw = weighting.rawWeighting(stemming,term)

# kelas1 comes from earlier in the script; two identical zips are built
# because a zip iterator can only be consumed once
gabung = zip(kelas1, raw)
gabung1 = zip(kelas1, raw)

valueKelas1 = []
valueKelas0 = []
for i in gabung1:
    if i[0]==0:
        valueKelas0.append(i[1])
    else:
        valueKelas1.append(i[1])
Example 12
def training_change_phrase(corpus):
    BAD = [
        "suffer", "adverse", "hazards", "risk", "death", "insufficient",
        "infection", "recurrence", "restlessness", "mortality", "hazard",
        "chronic", "pain", "negative", "severity", "complication", "risk",
        "adverse", "mortality", "morbidity", "death", "fatal", "danger",
        "no benefit", "discourage", "short-term risk", "long-term risk",
        "damage", "little information", "not been well studies", "ineffective",
        "suffer", "depression", "acute", "sore", "outpatient", "disabling",
        "diabetes", "difficulties", "dysfunction", "distorted", "poorer",
        "unable", "prolonged", "irritation", "disruptive", "pathological",
        "mutations", "disease", "infection", "harms", "difficulty", "weakened",
        "inactive", "stressors", "hypertension", "adverse", "insomnia",
        "relapsing", "malignant", "suffer", "exacerbate", "dryness", "fever",
        "overestimate", "constipation", "deposition", "colic", "tension",
        "hazards", "diarrhoea", "weakness", "irritability", "insidious",
        "distress", "weak", "cancer", "emergency", "risk", "block",
        "unsatisfactory ", "blinding", "nausea", "traumatic", "wound",
        "intention", "loses", "intensive", "relapse", "recurrent", "extension",
        "die", "cancers", "malaise", "crying", "toxic", "injury",
        "confounding", "complaints", "misuse", "insignificant", "poisoning",
        "anoxic", "amputation", "death", "nightmares", "deteriorate", "fatal",
        "injuries", "fatigue", "invasive", "suicide", "chronic", "relapsed",
        "disturbances", "confusion", "died", "fluctuating", "severities",
        "delusions", "compulsions", "conflict", "trauma", "cried", "impair",
        "severe", "tremor", "weaker", "illness", "inpatients", "worry",
        "rebound", "worse", "reversible", "dizziness", "attacks", "pointless",
        "disorders", "dyskinesia", "risks", "fatty", "negative", "conflicting",
        "upset", "fishy", "hard", "harm", "bleeding", "inflammatory",
        "hampered", "underpowered", "obstruction", "headache", "problem",
        "bleeds", "panic", "loss", "odds", "retardation", "dysfunctional",
        "render", "difficult", "drowsiness", "lack", "suicidal", "obsessions",
        "impaired", "cough", "severity", "suffering", "violent", "strokes",
        "virus", "stroke", "flatulence", "fibrates", "blind ", "burning ",
        "faintness", "suffered", "threatening", "misdiagnosing", "bitter",
        "excessive", "diabetics", "malfunction", "abnormal", "deterioration",
        "bad", "confounded", "sadness", "mortality", "disturbance", "agitated",
        "attack", "infections", "negativistic", "deaths", "poor", "wrong",
        "worsening", "adversely", "insufficient", "scarring", "headaches",
        "disability", "overdose ", "serious", "delayed", "discomfort",
        "sweating", "morbidity", "nerve", "parkinson", "toxicity", "nervous",
        "pain", "stress", "weakens", "incorrect", "disorder", "worsened",
        "malformations", "blinded", "rigidity", "prolong", "adversity",
        "abuse", "lacked", "dyspepsia", "sads ", "onset", "failure",
        "inadequate", "sensitivity", "impairment", "dementia", "harmful"
    ]
    GOOD = [
        "benefit", "improvement", "advantage", "accuracy", "great",
        "effective", "support", "potential", "superior", "mild", "achieved",
        "Supplementation", "beneficial", "positive", "benefit", "beneficial",
        "improve", "advantage", "resolve", "good", "fantastic", "relief",
        "superior", "efficacious", "effective", "improve effectiveness",
        "importance of protecting", "significant advantage",
        "significant therapeutic advantage", "may be effective",
        "effective approach", "simple and effective",
        "simple and effective treatment", "safe", "well tolerated",
        "well-tolerated", "useful", "maybe useful", "illustrate the benefits",
        "significant improvement", "significantly improve",
        "clinically worthwhile", "worthwhile", "recover rapid",
        "satisfactory outcome", "satisfactory", "similarly effective",
        "supports", "approve", "more effective", "high efficacy", "cured",
        "vitality", "relaxing", "benefit", "tolerability", "improvement",
        "right", "effective", "stable", "best", "better", "pleasurable",
        "relaxation", "favour", "beneficial", "safety", "prevents",
        "successful", "satisfaction", "significant", "superior",
        "contributions", "reliability", "robust", "tolerated", "improving",
        "survival", "favourable", "reliable", "recovered", "judiciously",
        "consciousness", "efficacy", "prevented", "satisfied", "prevent",
        "advantage", "encouraging", "tolerance", "success", "significance",
        "improved", "improves", "improve", "improvements"
    ]
    MORE = [
        "enhance", "higher", "exceed", "increase", "improve", "somewhat",
        "quite", "very", "higher", "more", "augments", "highest", "enhance",
        "augment", "increase", "amplify", "raise", "boost", "add to", "higher",
        "exceed", "rise", "go up", "surpass", "more", "additional", "extra",
        "added", "greater", "positive", "high", "prolonged", "prolong",
        "increase", "enhance", "elevation", "higher", "exceed", "enhancement",
        "peaked", "more", "excess"
    ]
    LESS = [
        "reduce", "decline", "fall", "less", "little", "slightly", "only",
        "mildly", "smaller", "lower", "reduction", "drop", "fewer", "slump",
        "fall", "down", "pummel", "less", "lower", "low", "decrease", "reduce",
        "decline", "descend", "collapse", "fail", "subside", "lesser",
        "poorer", "Worse", "smaller", "negative", "prevent", "reduced",
        "prevents", "below", "lower", "decrease", "fall", "low", "reduce",
        "decline", "less", "little", "mild", "drop", "fewer"
    ]

    # Lowercase, and strip the stray trailing spaces present in some entries;
    # list comprehensions (rather than map) so the results are reusable lists.
    BAD = [w.strip().lower() for w in BAD]
    GOOD = [w.strip().lower() for w in GOOD]
    MORE = [w.strip().lower() for w in MORE]
    LESS = [w.strip().lower() for w in LESS]
    BAD = pp.stemming(pp.lemmatization(BAD))
    GOOD = pp.stemming(pp.lemmatization(GOOD))
    MORE = pp.stemming(pp.lemmatization(MORE))
    LESS = pp.stemming(pp.lemmatization(LESS))

    # print('len change phrase: ' + str(len(BAD) + len(GOOD) + len(MORE) + len(LESS)))
    def sen2vec(sen):
        words = sen.split(' ')
        vecs = [0, 0, 0, 0]  #MORE GOOD, MORE BAD, LESS GOOD, LESS BAD
        for i in range(len(words)):
            if words[i] in MORE:
                print('more=' + words[i])
                for k in range(i, len(words)):
                    if words[k] in GOOD:
                        print('good=' + words[k])
                        vecs[0] = 1
                        break
                    if words[k] in BAD:
                        print('bad=' + words[k])
                        vecs[1] = 1
                        break
            elif words[i] in LESS:
                for k in range(i, len(words)):
                    if words[k] in GOOD:
                        vecs[2] = 1
                        break
                    if words[k] in BAD:
                        vecs[3] = 1
                        break
        return vecs

    result = [sen2vec(sen) for sen in corpus]
    return result
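A hypothetical usage sketch, assuming the pp preprocessing module is importable; input sentences are expected to be preprocessed (lowercased/stemmed) the same way as the keyword lists:

corpus = ["higher risk of infection", "treatment improve survival"]
vectors = training_change_phrase(corpus)
# one [more-good, more-bad, less-good, less-bad] flag vector per sentence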