Example #1
import csv
import os
from nltk.tokenize import word_tokenize
from progress.bar import IncrementalBar
from stemming import PorterStemmer  # as in Example #4 below; stem(word, i, j) takes start/end indices

def file_processing(file, root, stop_words):
    p = PorterStemmer()
    # Count the data rows (total lines minus the header) to size the progress bar
    with open(file) as f:
        length = len(f.readlines()) - 1
    bar = IncrementalBar('In progress', max=length)

    with open(file, 'r') as csvFile:

        reader = csv.reader(csvFile)
        next(reader)

        for row in reader:
            # row[1] names the per-row output directory; create it if needed
            if not os.path.exists(root + row[1]):
                os.mkdir(root + row[1])

            # Remove stop words first
            example = row[0]
            word_tokens = word_tokenize(example)

            filtered_sentence = [w for w in word_tokens if w not in stop_words]
            joined_sentence = ' '.join(filtered_sentence) + '\n'

            # Do stemming

            output = ''
            word = ''
            line = joined_sentence
            if line == '':
                continue  # skip an empty line instead of aborting the remaining rows
            # Stem each alphabetic run; pass non-letter characters through unchanged
            for c in line:
                if c.isalpha():
                    word += c.lower()
                else:
                    if word:
                        output += p.stem(word, 0, len(word) - 1)
                        word = ''
                    output += c.lower()


            path = root + row[1] + '/' + row[2] + '.txt'
            with open(path, "w") as cursor:

                # Write file
                cursor.write(output)

            bar.next()


        bar.finish()
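For reference, a minimal usage sketch of this function; the file names, the CSV layout (text in column 0, output directory name in column 1, file id in column 2), and the NLTK stop-word list are assumptions, not part of the original example:

import nltk
from nltk.corpus import stopwords

nltk.download('punkt')      # models required by word_tokenize
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
file_processing('train.csv', 'output/', stop_words)   # hypothetical input file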
Example #2
import re
from collections import Counter
from stemming import PorterStemmer  # as in Example #4 below

class Process_text:

    def __init__(self, stop_list_filename):
        # Strip the trailing \n from each stop word and close the file promptly
        with open(stop_list_filename, 'r') as f:
            self.stop_list = [line.rstrip() for line in f]
        self.stemmer = PorterStemmer()

    def _tokenize(self, text):
        return re.findall(r'\w+', text)

    def _stem(self, words_list):
        return map(lambda x: self.stemmer.stem(x, 0, len(x) - 1), words_list)

    def _remove_common_words(self, words_list):
        # return [word for word in words_list if word not in self.stop_list]

        # From https://gist.github.com/glenbot/4684356
        # 2x as fast... But it's not a one-liner.
        stop_words = set(self.stop_list)
        for sw in stop_words.intersection(words_list):
            occurrences = words_list.count(sw)
            for i in xrange(occurrences):
                words_list.remove(sw)

        return words_list

    def _word_statistics(self, words_list):
        return Counter(words_list)

    def sanitize_rawtext(self, raw_text):
        return self._stem(self._remove_common_words(self._tokenize(raw_text.lower())))

    def sanitize_rawtext_with_stats(self, raw_text):
        # One-liners FTW
        return self._word_statistics(self.sanitize_rawtext(raw_text))
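A minimal usage sketch of Process_text, assuming Python 2 (the class uses xrange) and a hypothetical stop_words.txt with one stop word per line; the stemmed outputs are illustrative:

pt = Process_text('stop_words.txt')
print pt.sanitize_rawtext('the computers are computing')             # e.g. ['comput', 'comput']
print pt.sanitize_rawtext_with_stats('the computers are computing')  # e.g. Counter({'comput': 2})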
Example #3
import csv
from nltk.tokenize import word_tokenize
from stemming import PorterStemmer  # as in Example #4 below

def file_processing(file, stop_words):
    p = PorterStemmer()
    rows = []

    with open(file, 'r') as csvFile:

        reader = csv.reader(csvFile)
        next(reader)

        for row in reader:
            # Remove stop words first
            example = row[1]
            word_tokens = word_tokenize(example)

            filtered_sentence = [w for w in word_tokens if w not in stop_words]
            joined_sentence = ' '.join(filtered_sentence) + '\n'

            # Do stemming

            output = ''
            word = ''
            line = joined_sentence
            if line == '':
                continue  # skip an empty line instead of aborting the remaining rows
            # Stem each alphabetic run; pass non-letter characters through unchanged
            for c in line:
                if c.isalpha():
                    word += c.lower()
                else:
                    if word:
                        output += p.stem(word, 0, len(word) - 1)
                        word = ''
                    output += c.lower()
            # Keep the processed text; the class column is left as '?'
            rows.append([output.rstrip('\n'), '?'])
    with open("new_test.csv", "w") as csvFile:

        # Write file
        csvwriter = csv.writer(csvFile)

        csvwriter.writerow(['text', 'class'])

        csvwriter.writerows(rows)
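A minimal usage sketch; the input file name and the stop-word source are assumptions, and the CSV is expected to have a header row with the raw text in its second column:

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
file_processing('test.csv', stop_words)   # writes the cleaned rows to new_test.csv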
Example #4
import math
import sys
import re
from stemming import PorterStemmer

p = PorterStemmer()

# control values
stem = False
stopwords = False

stopList = []
wordList = []
numberList = []

docInfo = {}
documentList = {}
contextList = {}
frequencyList = {}
locationList = {}
totalList = {}

# Check whether a string contains any digit (used to filter out numbers and titles)
def contains_digits(s):
    return any(char.isdigit() for char in s)

# Read a document: split on spaces and walk the tokens
def readDoc(doc, context):
    lists = doc.split(" ")
    for w in range(len(lists)):
        i = lists[w]
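The fragment above breaks off inside readDoc, but contains_digits is complete; a quick illustrative check:

print(contains_digits('Chapter 12'))   # True
print(contains_digits('chapter'))      # False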
Example #5
import re
from stemming import PorterStemmer  # as in Example #4 above

class Process_query:
    def __init__(self, stop_list_filename, format_type="vectorial"):
        # Strip the trailing \n from each stop word and close the file promptly
        with open(stop_list_filename, "r") as f:
            self.stop_list = [line.rstrip() for line in f]
        self.stemmer = PorterStemmer()
        self.format_type = format_type

    def format_query(self, query):
        if self.format_type == "vectorial" or self.format_type == "probabilistic":
            return self._create_vectorial_query_from_string(query)
        elif self.format_type == "boolean":
            return self._create_boolean_query_from_json(query)
        else:
            raise ValueError("Unsupported query type!")

    def _create_vectorial_query_from_string(self, query_string):
        return self._vectorial_stem_elements_from_list(
            self._remove_common_words_from_list(re.findall(r"\w+", query_string.lower()))
        )

    def _create_boolean_query_from_json(self, query_string):
        """
        We only accept NDF queries, ie, a disjunction of conjunctions of terms (possibly negated with NOT).
        The valid accepted format is a string of NDF form.
        Examples:
            'computer AND series OR NOT conclusion AND testing'
            'study OR preprocessing'
            'IBM AND simulation'

        Query will be processed by a stemmer and common words will be removed, so there is no need to put them into the query.
        Empty list queries or clauses will return nothing.
        For instance, [[], ['another', 'nonrational', 'model']] is equivalent to [['another', 'nonrational', 'model']],
        which, after stemming + common-words removal, will give [['nonrat', 'model']]
        """
        query_list = self._byteify(map(lambda x: x.split(" and "), query_string.split(" or ")))

        if not self._check_valid_query(query_list):
            raise ValueError("The query does not have a valid format")

        return self._sanitize_boolean_query(query_list)

    def _byteify(self, input):
        """
        Transforms unicode objects from JSON decode to UTF-8 ones.
        Copied from stackoverflow.com/questions/956867/how-to-get-string-objects-instead-of-unicode-ones-from-json-in-python#answer-13105359
        """
        if isinstance(input, dict):
            return {self._byteify(key): self._byteify(value) for key, value in input.iteritems()}
        elif isinstance(input, list):
            return [self._byteify(element) for element in input]
        elif isinstance(input, unicode):
            return input.encode("utf-8")
        else:
            return input

    def _check_valid_query(self, query_list):
        if type(query_list) is not list:
            return False

        for element in query_list:
            if type(element) is not list or not self._check_only_strings_in_list(element):
                return False

        return True

    def _check_only_strings_in_list(self, element_list):
        for element in element_list:
            if type(element) is not str:
                return False

        return True

    def _sanitize_boolean_query(self, query_list):
        # Stem the elements and drop the common ones.
        # For speed, common words are removed first so the stemmer sees fewer terms.
        return map(
            lambda element: self._boolean_stem_elements_from_list(self._boolean_remove_common_words_from_list(element)),
            query_list,
        )

    def _boolean_remove_common_words_from_list(self, word_list):
        # print '_boolean_remove_common_words_from_list', word_list
        return [element for element in word_list if not self._boolean_should_delete(element)]

    def _vectorial_stem_elements_from_list(self, word_list):
        return map(lambda x: self.stemmer.stem(x, 0, len(x) - 1), word_list)

    def _boolean_stem_elements_from_list(self, word_list):
        # print '_boolean_stem_elements_from_list', word_list
        for i in xrange(len(word_list)):
            if self._is_real_word(word_list[i]):
                word_list[i] = self.stemmer.stem(word_list[i], 0, len(word_list[i]) - 1)
            else:
                word_list[i] = "not " + self.stemmer.stem(word_list[i][4:], 0, len(word_list[:4]) - 1)

        return word_list

    def _stem_elements_from_list(self, query_words):
        return map(lambda x: self.stemmer.stem(x, 0, len(x) - 1), query_words)

    def _remove_common_words_from_list(self, query_words):
        return [word for word in query_words if word not in self.stop_list]

    def _boolean_should_delete(self, element):
        if self._is_real_word(element):
            real_element = element
        else:
            real_element = element[4:]

        return real_element in self.stop_list

    def _is_real_word(self, element):
        return element[:4] != "not "
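A minimal usage sketch of Process_query, again assuming Python 2 (the class relies on unicode, iteritems and xrange) and a hypothetical stop_words.txt; the stemmed outputs are illustrative:

pq = Process_query('stop_words.txt', format_type='boolean')
print pq.format_query('computer and series or not conclusion')
# e.g. [['comput', 'seri'], ['not conclus']]

pq_vect = Process_query('stop_words.txt')   # format_type defaults to "vectorial"
print pq_vect.format_query('study or preprocessing')
# e.g. ['studi', 'preprocess']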