def preprocess(text):
    # Apply the project's tweet-specific feature-reduction steps, then stem.
    text = feature_reduction.process_tweet(text)
    text = feature_reduction.remove_punctuation(text)
    text = feature_reduction.remove_emoticon(text)
    text = feature_reduction.replaceTwoOrMore(text)
    text = preprocessing.stemming(text)
    return text
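# A minimal usage sketch; `feature_reduction` and `preprocessing` are assumed
# to be this project's helper modules, and the sample tweet is illustrative.
sample = "Loooove this!!! :) #winning"
print(preprocess(sample))  # punctuation/emoticons stripped, repeats collapsed, words stemmed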
def toMerge(term, term_0):
    term = term.split(' ')
    term_0 = term_0.split(' ')
    if len(term) == 1 and len(term_0) == 1:
        # unigram terms with the same word stem
        if pre.stemming(term[0]) == pre.stemming(term_0[0]):
            return 'same_stem'
        return False
    elif contain(term, term_0):  # term contains term_0
        return 'contain'
    elif contain(term_0, term):  # term is contained by term_0
        return 'be_contained'
    elif intersect(term, term_0):
        # term intersects with term_0: at least half of the longer term is shared
        return intersect(term, term_0)
    return False
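# `contain` and `intersect` are referenced above but not defined in this
# snippet. A plausible sketch under that assumption: `contain(a, b)` checks
# that every word of b occurs in a, and `intersect(a, b)` returns the shared
# words when they cover at least half of the longer term, else False.
def contain(a, b):
    return all(w in a for w in b)

def intersect(a, b):
    shared = [w for w in a if w in b]
    if shared and len(shared) * 2 >= max(len(a), len(b)):
        return ' '.join(shared)
    return False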
from urllib.request import urlopen
from urllib.parse import urlparse, urljoin
import re

from bs4 import BeautifulSoup

# `process` (tokenizer/stopword/stemming helpers) and the module-level dicts
# `page_content`, `word_count`, `crawler_tuple` are defined elsewhere.

def scrape(visited, vocab):
    i = 0
    for url in visited:
        try:
            response = urlopen("http://" + url)
            print("scraping", i)
            i += 1
        except Exception:
            continue
        # domain of the current page, used to keep only same-site links
        base = urlparse("http://" + url).netloc
        bs = BeautifulSoup(response, 'html.parser')
        try:
            title = bs.find('title').text
        except AttributeError:
            continue
        tags = ['p', 'span', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div']
        content = title if title else ''
        for tag in tags:
            text_tag = bs.find_all(tag)
            textContent = [x.text for x in text_tag]
            content += ' '.join(textContent)
        page_content[url] = {'data': content}
        content = re.sub('\n', ' ', content)
        # tokenize, remove stopwords, stem; length2 presumably drops short tokens
        tokens = process.tokenizer_fun(content)
        cleaned = process.remove_stopwords(tokens)
        stemmed = process.stemming(cleaned)
        cleaned2 = process.remove_stopwords(stemmed)
        cleaned_text = process.length2(cleaned2)
        word_count[url] = {}
        seen = set()  # each token counts toward document frequency once per page
        for token in cleaned_text:
            if token not in seen:
                vocab[token] = vocab.get(token, 0) + 1
                seen.add(token)
            word_count[url][token] = word_count[url].get(token, 0) + 1
        links = [urljoin("http://" + url, l.get('href')) for l in bs.find_all('a')]
        links = [l.rstrip("/") for l in links if urlparse(l).netloc == base]
        crawler_tuple[url] = (url, cleaned_text, list(set(links)))
    return crawler_tuple
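# Usage sketch; the seed domain is illustrative, and the three module-level
# dicts the function writes into are initialized here for the demo.
page_content, word_count, crawler_tuple = {}, {}, {}
vocab = {}
result = scrape(["example.com"], vocab)
# result maps each fetched URL to (url, stemmed_tokens, same-domain outlinks)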
def toMerge(name, name_0):
    term = name.split(' ')
    term_0 = name_0.split(' ')
    if len(term) == 1 and len(term_0) == 1:
        # may merge due to same word stem: keep the first name
        if pre.stemming(term[0]) == pre.stemming(term_0[0]):
            return name
        return False
    elif contain(term, term_0):  # may merge due to inclusion: term contains term_0
        return name
    elif contain(term_0, term):  # may merge due to inclusion: term contained by term_0
        return name_0
    elif intersect(term, term_0):
        # may merge due to intersection: at least half of the longer term is shared
        return intersect(term, term_0)
    return False
def step1a(data):
    # For each row, gather the words affected by its negation annotations
    # (column 3), normalize them, and append the result to the global list `q`.
    for i in range(data.shape[0]):
        neg = data.iloc[i, 3]
        if len(neg) == 0:
            q.append([])
            continue
        list_effected_words = []
        for neg_item in neg:
            for w in neg_item['effectedWords']:
                list_effected_words.extend(
                    w['words'].encode('ascii', 'ignore').decode().split(' '))
        list_effected_words = [word.lower() for word in list_effected_words]
        list_effected_words = list(set(list_effected_words))
        list_effected_words = pp.lemmatization(list_effected_words)
        list_effected_words = pp.stemming(list_effected_words)
        q.append(list_effected_words)
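# A made-up row matching the structure step1a expects (column 3 holds the
# negation annotations); `q` and `pp` must exist as in the snippet above.
import pandas as pd

demo = pd.DataFrame([
    ["id0", "no relief from pain", None,
     [{'effectedWords': [{'words': 'relief from pain'}]}]],
])
step1a(demo)  # appends the lowercased, lemmatized, stemmed scope words to q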
def step1(data):
    # Mark each word inside a "no ..." negation cue with a '_NEG' suffix
    # directly in the sentence (column 1).
    for i in range(data.shape[0]):
        sen = data.iloc[i, 1]
        neg = data.iloc[i, 3]
        if len(neg) == 0:
            continue
        elif neg[0]['negex'].find(' no ') != -1:
            list_neg = neg[0]['negex'].split(' ')[1:]
            list_neg = pp.lemmatization(list_neg)
            list_neg = pp.stemming(list_neg)
            for n in list_neg:
                index = sen.find(n)
                if index == -1:
                    raise ValueError('negated word %r not found in sentence' % n)
                # accumulate markings on `sen` so earlier suffixes are kept
                sen = sen[:index + len(n)] + '_NEG' + sen[index + len(n):]
            data.iloc[i, 1] = sen
            print('-' * 10)
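# A made-up row illustrating the '_NEG' marking, assuming `pp.lemmatization`
# and `pp.stemming` return these short words unchanged:
import pandas as pd

demo = pd.DataFrame([
    ["id0", "there was no pain", None, [{'negex': ' no pain'}]],
])
step1(demo)
print(demo.iloc[0, 1])  # -> "there was no_NEG pain_NEG"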
def test_stemming(self):
    self.assertEqual(prep.stemming("playing running sleeping"), "play run sleep")
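# A minimal sketch of the unittest module this assertion implies; the class
# name is hypothetical and `prep` is the project's preprocessing module.
import unittest

class TestPreprocessing(unittest.TestCase):
    def test_stemming(self):
        self.assertEqual(prep.stemming("playing running sleeping"), "play run sleep")

if __name__ == "__main__":
    unittest.main()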
def process_query(query):
    # Apply the same tokenize / stopword-removal / stemming pipeline to a
    # search query that was applied to the scraped documents.
    cleaned_query = pro.tokenizer_fun(query)
    cleaned_query = pro.remove_stopwords(cleaned_query)
    cleaned_query = pro.stemming(cleaned_query)
    return cleaned_query
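# Usage sketch (illustrative query; `pro` is assumed to expose the same
# pipeline used on the scraped pages):
terms = process_query("how are web pages ranked")
print(terms)  # e.g. a list of stemmed content words like ['web', 'page', 'rank']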
# Read file
sentencesAlign, goldAnnotation = readFile.dataAlign(fileName)

# Preprocess the gold annotation
goldAnnotation = preprocessing.deleteNull(goldAnnotation)
listAnnotation = preprocessing.cleanGoldAnno(goldAnnotation)

for i in range(len(sentencesAlign)):
    sentence1 = sentencesAlign[i][0]
    sentence2 = sentencesAlign[i][1]

    # Preprocessing: tokenize, then stem both sentences
    sentenceToken1 = preprocessing.tokenize(sentence1)
    sentenceToken2 = preprocessing.tokenize(sentence2)
    sentenceLemma1 = preprocessing.stemming(sentenceToken1)
    sentenceLemma2 = preprocessing.stemming(sentenceToken2)

    # Aligners
    cekidentical = aligner.alignIdenticalWords(sentenceLemma1, sentenceLemma2)
    cekSimilar = aligner.alignWordSimilarity(sentenceLemma1, sentenceLemma2, ppdbDict)
    cekSequences = aligner.alignSequences(sentenceLemma1, sentenceLemma2)
    cekneighbor = aligner.alignTextContext(sentenceLemma1, sentenceLemma2, ppdbDict)
    cekstop = aligner.alignStop(sentenceLemma1, sentenceLemma2, ppdbDict)
    cekSimilarex = aligner.alignWordSimilarityex(sentenceLemma1, sentenceLemma2, ppdbDictex)
    # cekDep = aligner.alignDepContext(sentenceLemma1, sentenceLemma2)

    # Aligner results - features that will go into the final alignment output
import numpy as np
import os
import pickle

if __name__ == "__main__":
    args = config.get_args()

    # make the output directory
    utils.make_directory_doc(args)

    # load the dataset
    abstract, label = preprocessing.load_dataset(args)

    # stemming step; the Snowball stemmer from the nltk package is used
    abstract = preprocessing.stemming(abstract)

    # convert word text to indices
    sequences, word2idx, vocab_size, instances = preprocessing.get_sequences(
        abstract, args)

    # get context words, target word, and document index
    context, target, document = preprocessing.get_trainable_data(
        sequences, instances, args)
    num_document = np.max(document) + 1

    # load and compile the model
    model = doc2vec.load_model(args, vocab_size, args.embedding_size, num_document)
# Take the text and class 2D array from the original data
# (`openTeks`, `stop`, and `kelas1` are assumed to be defined earlier in the script)
readOriginal = openFile("komentar.csv")
teksDanKelas = teksDanKelas(readOriginal)
text = teksDanKelas[0]
kelasOrigin = teksDanKelas[1]
kelasOriginint = list(map(int, kelasOrigin))

# Preprocessing: tokenize, lowercase, stopword filtering, stemming
textToken = pre.Tokensisasi(text)
preprocessing = pre.Tokensisasi(openTeks)
lower = pre.lowerCase(preprocessing)
filtering = pre.filtering(lower, stop)
stemming = pre.stemming(filtering)
term = pre.term(stemming)
count_term = len(term)

# Raw term weighting, then split the weights by class label
raw = weighting.rawWeighting(stemming, term)
gabung1 = list(zip(kelas1, raw))
valueKelas1 = []
valueKelas0 = []
for i in gabung1:
    if i[0] == 0:
        valueKelas0.append(i[1])
    else:
        valueKelas1.append(i[1])
def training_change_phrase(corpus):
    # Cue word lists; duplicates are harmless since only membership tests are used.
    BAD = [
        "suffer", "adverse", "hazards", "risk", "death", "insufficient", "infection", "recurrence", "restlessness", "mortality",
        "hazard", "chronic", "pain", "negative", "severity", "complication", "risk", "adverse", "mortality", "morbidity",
        "death", "fatal", "danger", "no benefit", "discourage", "short-term risk", "long-term risk", "damage", "little information", "not been well studies",
        "ineffective", "suffer", "depression", "acute", "sore", "outpatient", "disabling", "diabetes", "difficulties", "dysfunction",
        "distorted", "poorer", "unable", "prolonged", "irritation", "disruptive", "pathological", "mutations", "disease", "infection",
        "harms", "difficulty", "weakened", "inactive", "stressors", "hypertension", "adverse", "insomnia", "relapsing", "malignant",
        "suffer", "exacerbate", "dryness", "fever", "overestimate", "constipation", "deposition", "colic", "tension", "hazards",
        "diarrhoea", "weakness", "irritability", "insidious", "distress", "weak", "cancer", "emergency", "risk", "block",
        "unsatisfactory ", "blinding", "nausea", "traumatic", "wound", "intention", "loses", "intensive", "relapse", "recurrent",
        "extension", "die", "cancers", "malaise", "crying", "toxic", "injury", "confounding", "complaints", "misuse",
        "insignificant", "poisoning", "anoxic", "amputation", "death", "nightmares", "deteriorate", "fatal", "injuries", "fatigue",
        "invasive", "suicide", "chronic", "relapsed", "disturbances", "confusion", "died", "fluctuating", "severities", "delusions",
        "compulsions", "conflict", "trauma", "cried", "impair", "severe", "tremor", "weaker", "illness", "inpatients",
        "worry", "rebound", "worse", "reversible", "dizziness", "attacks", "pointless", "disorders", "dyskinesia", "risks",
        "fatty", "negative", "conflicting", "upset", "fishy", "hard", "harm", "bleeding", "inflammatory", "hampered",
        "underpowered", "obstruction", "headache", "problem", "bleeds", "panic", "loss", "odds", "retardation", "dysfunctional",
        "render", "difficult", "drowsiness", "lack", "suicidal", "obsessions", "impaired", "cough", "severity", "suffering",
        "violent", "strokes", "virus", "stroke", "flatulence", "fibrates", "blind ", "burning ", "faintness", "suffered",
        "threatening", "misdiagnosing", "bitter", "excessive", "diabetics", "malfunction", "abnormal", "deterioration", "bad", "confounded",
        "sadness", "mortality", "disturbance", "agitated", "attack", "infections", "negativistic", "deaths", "poor", "wrong",
        "worsening", "adversely", "insufficient", "scarring", "headaches", "disability", "overdose ", "serious", "delayed", "discomfort",
        "sweating", "morbidity", "nerve", "parkinson", "toxicity", "nervous", "pain", "stress", "weakens", "incorrect",
        "disorder", "worsened", "malformations", "blinded", "rigidity", "prolong", "adversity", "abuse", "lacked", "dyspepsia",
        "sads ", "onset", "failure", "inadequate", "sensitivity", "impairment", "dementia", "harmful"
    ]
    GOOD = [
        "benefit", "improvement", "advantage", "accuracy", "great", "effective", "support", "potential", "superior", "mild",
        "achieved", "Supplementation", "beneficial", "positive", "benefit", "beneficial", "improve", "advantage", "resolve", "good",
        "fantastic", "relief", "superior", "efficacious", "effective", "improve effectiveness", "importance of protecting",
        "significant advantage", "significant therapeutic advantage", "may be effective", "effective approach",
        "simple and effective", "simple and effective treatment", "safe", "well tolerated", "well-tolerated", "useful",
        "maybe useful", "illustrate the benefits", "significant improvement", "significantly improve", "clinically worthwhile",
        "worthwhile", "recover rapid", "satisfactory outcome", "satisfactory", "similarly effective", "supports", "approve",
        "more effective", "high efficacy", "cured", "vitality", "relaxing", "benefit", "tolerability", "improvement", "right",
        "effective", "stable", "best", "better", "pleasurable", "relaxation", "favour", "beneficial", "safety", "prevents",
        "successful", "satisfaction", "significant", "superior", "contributions", "reliability", "robust", "tolerated",
        "improving", "survival", "favourable", "reliable", "recovered", "judiciously", "consciousness", "efficacy", "prevented",
        "satisfied", "prevent", "advantage", "encouraging", "tolerance", "success", "significance", "improved", "improves",
        "improve", "improvements"
    ]
    MORE = [
        "enhance", "higher", "exceed", "increase", "improve", "somewhat", "quite", "very", "higher", "more",
        "augments", "highest", "enhance", "augment", "increase", "amplify", "raise", "boost", "add to", "higher",
        "exceed", "rise", "go up", "surpass", "more", "additional", "extra", "added", "greater", "positive",
        "high", "prolonged", "prolong", "increase", "enhance", "elevation", "higher", "exceed", "enhancement", "peaked",
        "more", "excess"
    ]
    LESS = [
        "reduce", "decline", "fall", "less", "little", "slightly", "only", "mildly", "smaller", "lower",
        "reduction", "drop", "fewer", "slump", "fall", "down", "pummel", "less", "lower", "low",
        "decrease", "reduce", "decline", "descend", "collapse", "fail", "subside", "lesser", "poorer", "Worse",
        "smaller", "negative", "prevent", "reduced", "prevents", "below", "lower", "decrease", "fall", "low",
        "reduce", "decline", "less", "little", "mild", "drop", "fewer"
    ]
    # lowercase, lemmatize, and stem the cue lists (materialize map for Python 3)
    BAD = list(map(str.lower, BAD))
    GOOD = list(map(str.lower, GOOD))
    MORE = list(map(str.lower, MORE))
    LESS = list(map(str.lower, LESS))
    BAD = pp.stemming(pp.lemmatization(BAD))
    GOOD = pp.stemming(pp.lemmatization(GOOD))
    MORE = pp.stemming(pp.lemmatization(MORE))
    LESS = pp.stemming(pp.lemmatization(LESS))
    # print('len change phrase: ' + str(len(BAD) + len(GOOD) + len(MORE) + len(LESS)))

    def sen2vec(sen):
        # Feature vector: [MORE GOOD, MORE BAD, LESS GOOD, LESS BAD].
        # Note that multi-word cues ("no benefit") and entries with trailing
        # spaces ("blind ") can never match a single token here.
        words = sen.split(' ')
        vecs = [0, 0, 0, 0]
        for i in range(len(words)):
            if words[i] in MORE:
                print('more=' + words[i])
                for k in range(i, len(words)):
                    if words[k] in GOOD:
                        print('good=' + words[k])
                        vecs[0] = 1
                        break
                    if words[k] in BAD:
                        print('bad=' + words[k])
                        vecs[1] = 1
                        break
            elif words[i] in LESS:
                for k in range(i, len(words)):
                    if words[k] in GOOD:
                        vecs[2] = 1
                        break
                    if words[k] in BAD:
                        vecs[3] = 1
                        break
        return vecs

    result = [sen2vec(sen) for sen in corpus]
    return result