コード例 #1
0
def file_processing(file,root,stop_words):
    """Clean every data row of a CSV file and write each result to its own text file.

    The first row of ``file`` is treated as a header and skipped. For each
    remaining non-empty row, column 0 is tokenized, tokens found in
    ``stop_words`` are dropped, and every alphabetic run of the remaining
    text is lower-cased and stemmed. The result is written to
    ``root + row[1] + '/' + row[2] + '.txt'``; the directory
    ``root + row[1]`` is created when it does not exist yet.

    Args:
        file: Path of the input CSV file.
        root: Prefix of the output directories. It is concatenated as-is, so
            it must already end with a path separator if one is wanted.
        stop_words: Collection of tokens to discard. Matching is
            case-sensitive because it happens before lower-casing.
    """
    p = PorterStemmer()

    # Read the rows once: this gives an exact row count for the progress bar
    # without a second pass over the file. newline='' is what the csv module
    # expects so that newlines inside quoted fields are parsed correctly.
    with open(file, 'r', newline='') as csvFile:
        reader = csv.reader(csvFile)
        next(reader, None)  # skip the header; tolerate a completely empty file
        # csv.reader yields [] for blank lines; they carry no data to process.
        rows = [row for row in reader if row]

    bar = IncrementalBar('In progress', max=len(rows))
    try:
        for row in rows:
            directory = root + row[1]
            os.makedirs(directory, exist_ok=True)

            # Remove stop words first
            word_tokens = word_tokenize(row[0])
            filtered_sentence = [w for w in word_tokens if w not in stop_words]
            # The trailing newline also guarantees that the last word is
            # flushed through the stemmer by the loop below.
            joined_sentence = " ".join(filtered_sentence) + '\n'

            # Do stemming: stem each alphabetic run, copy every other
            # character through lower-cased.
            pieces = []
            word = ''
            for c in joined_sentence:
                if c.isalpha():
                    word += c.lower()
                    continue
                if word:
                    pieces.append(p.stem(word, 0, len(word) - 1))
                    word = ''
                pieces.append(c.lower())

            path = directory + '/' + row[2] + '.txt'
            with open(path, "w") as cursor:
                # Write file
                cursor.write(''.join(pieces))

            bar.next()
    finally:
        # Always close the progress bar, even if a row fails.
        bar.finish()
コード例 #2
0
def file_processing(file,stop_words,output_path="new_test.csv"):
    """Clean the text column of a CSV file and write the result to a new CSV file.

    The first row of ``file`` is treated as a header and skipped. For each
    remaining non-empty row, column 1 is tokenized, tokens found in
    ``stop_words`` are dropped, and every alphabetic run of the remaining
    text is lower-cased and stemmed. The output file starts with the header
    ``text,class`` followed by one ``[cleaned_text, '?']`` row per input row.

    Args:
        file: Path of the input CSV file.
        stop_words: Collection of tokens to discard. Matching is
            case-sensitive because it happens before lower-casing.
        output_path: Where to write the resulting CSV file. Defaults to
            ``"new_test.csv"`` in the current working directory.
    """
    p = PorterStemmer()
    rows = []

    # newline='' is what the csv module expects for both reading and writing.
    with open(file, 'r', newline='') as csvFile:
        reader = csv.reader(csvFile)
        next(reader, None)  # skip the header; tolerate a completely empty file

        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to process.
                continue

            # Remove stop words first
            word_tokens = word_tokenize(row[1])
            filtered_sentence = [w for w in word_tokens if w not in stop_words]
            # The trailing newline guarantees that the last word is flushed
            # through the stemmer by the loop below.
            joined_sentence = " ".join(filtered_sentence) + '\n'

            # Do stemming: stem each alphabetic run, copy every other
            # character through lower-cased.
            pieces = []
            word = ''
            for c in joined_sentence:
                if c.isalpha():
                    word += c.lower()
                    continue
                if word:
                    pieces.append(p.stem(word, 0, len(word) - 1))
                    word = ''
                pieces.append(c.lower())

            # '?' is written as the placeholder value of the 'class' column.
            rows.append([''.join(pieces).rstrip('\n'), '?'])

    # Without newline='' the csv writer emits an extra blank line after every
    # row on platforms that translate '\n' to '\r\n'.
    with open(output_path, "w", newline='') as csvFile:
        # Write file
        csvwriter = csv.writer(csvFile)
        csvwriter.writerow(['text', 'class'])
        csvwriter.writerows(rows)
コード例 #3
0
ファイル: invert.py プロジェクト: DGGomez/searchEngine
import math
import sys
import re
from stemming import PorterStemmer

# Stemmer instance created once at import time and kept at module level.
p = PorterStemmer()

# control values
# Both switches start disabled.
# NOTE(review): presumably they enable stemming and stop-word filtering and
# are set by code outside this excerpt - confirm.
stem = False
stopwords = False

# Module-level lists, all initially empty.
# NOTE(review): how they are populated is not visible in this excerpt.
stopList = []
wordList = []
numberList = []

# Module-level dictionaries, all initially empty.
# NOTE(review): key/value layout is not visible in this excerpt - check the
# functions that fill them before relying on a particular schema.
docInfo = {}
documentList={}
contextList = {}
frequencyList = {}
locationList = {}
totalList = {}

# check whether a string contains a digit (used to remove numbers and titles)
def contains_digits(s):
    """Return True if at least one character of ``s`` is a digit, else False."""
    for ch in s:
        if ch.isdigit():
            return True
    return False

# read doc
def readDoc(doc, context):
    lists = doc.split(" ")
    for w in range(len(lists)):
        i = lists[w]
コード例 #4
0
 def __init__(self, stop_list_filename):
     """Load the stop list from a file and create the stemmer.

     Args:
         stop_list_filename: Path of a text file with one stop word per
             line. Trailing whitespace (including the newline) is removed
             from every line.
     """
     # Use a context manager so the file handle is closed promptly, and
     # build a real list: on Python 3 ``map`` returns a one-shot iterator
     # that would be empty after its first traversal.
     with open(stop_list_filename, 'r') as stop_file:
         self.stop_list = [line.rstrip() for line in stop_file]
     self.stemmer = PorterStemmer()