def removeEmphasis(sentence):
    """Apply ``remove_emphasis`` to every whitespace-separated token.

    *sentence* is split with ``str.split()`` (any run of whitespace is a
    separator) and each resulting token is passed through
    ``remove_emphasis``.

    Returns:
        A list with one processed token per input token, in input order.
    """
    processed = []
    for token in sentence.split():
        processed.append(remove_emphasis(token))
    return processed
# Load the scraped spreadsheet; everything below reads from `df`.
file = path.join('data', 'nsk_scrape.xlsx')
xl = pd.ExcelFile(file)
df = xl.parse('Sheet1')
df.head()  # bare expression: only shows output in an interactive session

STOPWORDS = set(stopwords.words('greek'))

# Tokens are passed through remove_emphasis() and upper-cased *before* the
# stop-word check, so the stop words must get the same normalisation.
# Comparing upper-case tokens against the raw (lower-case) NLTK entries, as
# the code did before, never filtered anything out.
# NOTE(review): assumes remove_emphasis() accepts any single word - confirm.
_NORMALISED_STOPWORDS = {remove_emphasis(w).upper() for w in STOPWORDS}

# Punctuation, symbols and digits removed from each text before tokenising.
# Compiled once instead of being re-resolved for every row.
_STRIP_CHARS = re.compile(r"[-,()/@\'?\.$%_+\d]", flags=re.I)

# A single stemmer instance is reused for all rows (it used to be rebuilt
# on every loop iteration).
stemmer = gr_stemm.GreekStemmer()


def _normalise_subject(text):
    """Turn one raw 'Concultatory' text into a space-joined string of stems.

    Steps, in order: strip the characters matched by ``_STRIP_CHARS``, split
    on whitespace, apply ``remove_emphasis`` and upper-case each token, drop
    stop words and tokens shorter than three characters, stem what is left,
    and lower-case the stems.

    Args:
        text: the raw cell value; must be a string.

    Returns:
        The processed tokens joined by single spaces ("" if none survive).
    """
    cleaned = _STRIP_CHARS.sub('', text)
    words = [remove_emphasis(token).upper() for token in cleaned.split()]
    stems = [
        stemmer.stem(word)
        for word in words
        if word not in _NORMALISED_STOPWORDS and len(word) >= 3
    ]
    return " ".join(stem.lower() for stem in stems)


# Processing of the legal opinions (the 'Concultatory' column).
print(df.shape[0])
corpus = [_normalise_subject(text) for text in df['Concultatory']]
corpus = pd.DataFrame(corpus, columns=['Concultatory'])
corpus.head()

# Put the processed text next to its label; the join aligns on the row index.
result = corpus.join(df[['Status']])
result.groupby(['Status']).size()
result.head()
def removeEmphasis(word):
    """Split *word* on whitespace and run ``remove_emphasis`` on each piece.

    NOTE(review): this re-defines ``removeEmphasis`` from earlier in the
    file with the same behaviour; only the parameter name differs.

    Returns:
        A list containing the result of ``remove_emphasis`` for every
        whitespace-separated piece of *word*, in order.
    """
    pieces = word.split()
    return list(remove_emphasis(piece) for piece in pieces)