Example #1
def stemWords(tokens):
    """Apply the Porter stemmer to each token and return the list of stems."""
    stemmer = PorterStemmer()
    stemmedWords = []
    for token in tokens:
        # This PorterStemmer's stem() takes (word, start, last_index).
        stemmed = stemmer.stem(token, 0, len(token) - 1)
        stemmedWords.append(stemmed)
    return stemmedWords
Example #2
def stemWords(input_tokens):
    """Stem each input token and return the stems as strings."""
    stemmer = PorterStemmer()
    stemmed_words = []
    for token in input_tokens:
        stemmed_words.append(str(stemmer.stem(token, 0, len(token) - 1)))

    return stemmed_words
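Both versions above assume a PorterStemmer class is already imported at module level. A minimal usage sketch, assuming the classic porter.py implementation (whose stem method takes the word plus start and last-character indices), might look like this:

from porter import PorterStemmer  # hypothetical module name; adjust to your copy

tokens = ["caresses", "ponies", "running"]
print(stemWords(tokens))  # expected: ['caress', 'poni', 'run']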
Example #3
def porter_stem(corp):
    """
    Builds a dictionary with words as keys and stems as the values.
    """
    from porterstemmer import PorterStemmer

    ps = PorterStemmer()
    psdict = {}
    for w in corp.words:
        psdict[w] = ps.stem(w)
    
    return psdict
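A quick sketch of how porter_stem might be invoked. The corp argument only needs a words attribute, so a stand-in object works; note that this PorterStemmer variant exposes a one-argument stem(word):

from types import SimpleNamespace

# Hypothetical stand-in for the corpus object the snippet expects.
corp = SimpleNamespace(words=["running", "ponies", "cats"])
print(porter_stem(corp))  # e.g. {'running': 'run', 'ponies': 'poni', 'cats': 'cat'}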
Example #4
def task1(input_file_name, output_file_name, stop_words_list):
    # Assumes `import re`, `from string import digits`, and a PorterStemmer
    # class are available at module level.

    # Open the input file and the stop-word list, and create the output file
    f_input = open(input_file_name, "r")
    f_output = open(output_file_name, "w+")
    f_stop_words = open(stop_words_list, "r")

    list_lines = f_input.readlines()
    # list of stop words
    list_stop_words = f_stop_words.readlines()
    list_stop_words = list(map(lambda x: x.strip(), list_stop_words))

    # list of document names
    list_documents = []

    ps = PorterStemmer()

    for i in range(len(list_lines)):
        list_words = []  # words for this line, with stop words removed
        list_words_stemming = []  # stemmed words for this line

        list_documents.append(list_lines[i].split()[0])

        # collapse tabs, newlines, and other whitespace into single spaces
        list_lines[i] = re.sub(r'\s', " ", list_lines[i])
        # convert upper case to lower case
        list_lines[i] = list_lines[i].lower()
        # remove digits
        list_lines[i] = list_lines[i].translate(str.maketrans('', '', digits))
        # remove punctuation
        list_lines[i] = re.sub(r'[^a-zA-Z0-9\s]', '', list_lines[i])

        for w in list_lines[i].split()[1:]:
            if w not in list_stop_words:
                list_words.append(w)

        for y in list_words:
            list_words_stemming.append(ps.stem(y, 0, len(y) - 1))

        # Write the document name in front of the content in the output file
        f_output.write(list_documents[i] + "\t")
        # Write the content of the document in the output file
        for z in list_words_stemming:
            f_output.write(z + " ")
        f_output.write("\n")

    # Close all the files
    f_output.close()
    f_input.close()
    f_stop_words.close()
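A usage sketch for task1. The code implies one document per input line, with the document name as the first token, and one stop word per line in the stop-word file; the file names below are hypothetical:

# documents.txt: "<doc_name> <word> <word> ..." per line
# stopwords.txt: one stop word per line
task1("documents.txt", "stemmed_output.txt", "stopwords.txt")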
Example #5
def stemWord(word):
    """Return the Porter stem of a single word."""
    stemmer = PorterStemmer()
    return stemmer.stem(word, 0, len(word) - 1)
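For a single word the call reduces to the stemmer's (word, start, last_index) interface. For example:

print(stemWord("running"))      # expected: 'run'
print(stemWord("connections"))  # expected: 'connect'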
Example #6
def parsetoken(db, line):
    global documents
    global tokens
    global terms
    #
    # Create an instance of the Porter stemmer; we will call its stem method
    # to stem the tokens extracted from the line.
    #
    p = PorterStemmer()

    # this replaces any tab characters with a space character in the line
    # read from the file
    line = line.replace('\t', ' ')
    line = line.strip()
    #line.encode('ascii', 'ignore')

    #
    # This routine splits the contents of the line into tokens
    l = splitchars(line)

    # process each token in the line
    for elmt in l:
        # This statement removes the newline character if found
        elmt = elmt.replace('\n', '')

        # This statement converts all letters to lower case
        lowerElmt = elmt.lower().strip()

        #
        # Increment the count of tokens processed. This value gives the total
        # size of the corpus: the number of tokens in the entire collection.
        #
        tokens += 1

        # if the token is less than 2 characters in length we assume
        # that it is not a valid term and ignore it
        #
        if len(lowerElmt) < 2:
            continue

        #
        # if the token is in the stopwords list then do not include in the term
        # dictionary and do not index the term.
        #
        if (lowerElmt in stopwords):
            continue

        #
        # Skip terms that are numbers. We try to convert the term to an int:
        # if the conversion fails (ValueError), the term contains non-numeric
        # characters and can be indexed; if it succeeds, the term is a number
        # and the `continue` moves on to the next token.
        #
        try:
            dummy = int(lowerElmt)
        except ValueError:
            # Value is not a number so we can index it
            stemword = lowerElmt
        else:
            # value is a number so we will NOT add it to the index
            continue

        #
        # Call the Porter stemmer code included in our indexer. Stemming the
        # tokens reduces the size of our data dictionary.
        #
        lowerElmt = p.stem(stemword, 0, len(stemword) - 1)

        # if the term doesn't currently exist in the term dictionary
        # then add the term
        if lowerElmt not in db:
            terms += 1
            db[lowerElmt] = Term()
            db[lowerElmt].termid = terms
            db[lowerElmt].docids = dict()
            db[lowerElmt].docs = 0

        # if the document is not currently in the postings
        # list for the term then add it
        #
        if documents not in db[lowerElmt].docids:
            db[lowerElmt].docs += 1
            db[lowerElmt].docids[documents] = 0

        # Increment the counter that tracks the term frequency
        db[lowerElmt].docids[documents] += 1
    return l
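parsetoken relies on several module-level names the snippet does not show: the counters documents, tokens, and terms, a stopwords collection, a splitchars helper, and a Term class (a PorterStemmer import is assumed as before). A minimal sketch of those assumed definitions, just enough to exercise the function:

import re

documents = 1   # id of the document currently being indexed
tokens = 0      # running token count across the corpus
terms = 0       # number of distinct terms assigned ids so far
stopwords = {"the", "a", "an", "of", "over"}

class Term:
    """Postings entry: term id, document frequency, per-document counts."""
    termid = 0
    docids = None
    docs = 0

def splitchars(line):
    # split on any run of non-alphanumeric characters
    return re.split(r"[^a-zA-Z0-9]+", line)

db = {}
parsetoken(db, "The quick brown fox\tjumps over 42 lazy dogs")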
Example #7
def generate_feature_csv(
    csv_out, csv_in="bechdel_full.csv", female_word_filename=None, female_name_filename=None, verbose=False
):
    """
    Given a csv file csv_in of features, 
    """

    if verbose:
        print("Generating basic features and booleans...")

    raw_data = pd.read_csv(csv_in)
    data = pd.DataFrame(index=raw_data.index)
    data["Bechdel_pass"] = [1 if x == "pass" else 0 for x in raw_data["Bechdel_rating"]]
    data["Year"] = raw_data["Year"]

    # Only 2 films have N/A votes and ratings. I think it's OK to just zero
    # their votes/ratings here

    data["imdbRating"] = [x if x != "N/A" else 0 for x in raw_data["imdbRating"]]
    data["imdbVotes"] = [int(re.sub(",", "", x)) if x != "N/A" else 0 for x in raw_data["imdbVotes"]]

    # Adding booleans for month (not present for all releases). The thinking is
    # that movie "types" are released in seasons - blockbusters in the summer,
    # Oscar winners near year's end - and this may impact Bechdel rating.

    release_months = [
        datetime.datetime.strptime(x, "%d %b %Y").month if x != "N/A" else None for x in raw_data["Released"]
    ]
    release_months = level_booleans(release_months, "Month", zeros_ones=True)
    for col in release_months.columns:
        data[col] = release_months[col]

    # Booleans for parental rating. Uses the rating_bucket function to deal
    # with the wide variety of rating types.

    rating_buckets = [rating_bucket(x) for x in raw_data["Rated"]]
    rating_buckets = level_booleans(rating_buckets, "Rating", zeros_ones=True)
    for col in rating_buckets.columns:
        data[col] = rating_buckets[col]

    # Genre membership; this was easy to process because the genre
    # strings are pretty clean

    genre_membership = level_booleans(raw_data["Genre"], "Genre", sep=", ", zeros_ones=True)
    for col in genre_membership.columns:
        data[col] = genre_membership[col]

    # Runtime in minutes

    runtime_re = re.compile(r"((?P<hr>\d+) h){0,1} {0,1}((?P<min>\d+) min){0,1}")
    runtime_mins = []
    runtime_na = []
    for runtime_str in raw_data["Runtime"]:
        if runtime_str == "N/A":
            runtime_mins.append(0)
            runtime_na.append(1)
        else:
            runtime_match = runtime_re.match(runtime_str)
            (runtime_hr, runtime_min) = runtime_match.group("hr"), runtime_match.group("min")
            if runtime_hr is None:
                runtime_hr = 0
            if runtime_min is None:
                runtime_min = 0
            runtime_mins.append(int(runtime_hr) * 60 + int(runtime_min))
            runtime_na.append(0)
    data["Runtime"] = runtime_mins
    data["Runtime_na"] = runtime_na

    if verbose:
        print("Generating word-based features (stemmed words and female names)...")

    # Porter-stemmed titles and plot summaries, and look for "female words"
    # (like 'she', 'woman', etc.)

    if female_word_filename is not None:
        ps = PorterStemmer()
        f = open(female_word_filename, "r")
        female_stems = set([ps.stem(x.strip().lower(), 0, len(x.strip()) - 1) for x in f])
        f.close()
        has_female_word = []
        for plot in raw_data["Title"] + " " + raw_data["Plot"]:
            if plot == "N/A":
                has_female_word.append(None)
            else:
                cur_has_female_word = 0
                plot_clean = re.sub(r"[^\w\s]", " ", plot).lower().strip()
                plot_words = re.split(r"\s+", plot_clean)
                plot_stems = [ps.stem(x, 0, len(x) - 1) for x in plot_words]
                for plot_stem in plot_stems:
                    if plot_stem in female_stems:
                        cur_has_female_word = 1
                        break
                has_female_word.append(cur_has_female_word)
        data["Female_word"] = has_female_word

    # Number of female names in the actor list: 0 or 1 (and anything not
    # flagged as either should be considered 2+)

    if female_name_filename is not None:
        f = open(female_name_filename, "r")
        female_nameset = set([x.strip().lower() for x in f])
        f.close()
        has_0_female_name = []
        has_1_female_name = []
        for actor_list in raw_data["Actors"]:
            if actor_list == "N/A":
                # again this issue only comes up twice
                has_0_female_name.append(0)
                has_1_female_name.append(0)
            else:
                actor_clean = re.sub(r"[^\w\s]", " ", actor_list).lower().strip()
                actor_names = re.split(r"\s+", actor_clean)
                female_name_count = 0
                for actor_name in actor_names:
                    if actor_name in female_nameset:
                        female_name_count += 1
                if female_name_count == 0:
                    has_0_female_name.append(1)
                    has_1_female_name.append(0)
                elif female_name_count == 1:
                    has_0_female_name.append(0)
                    has_1_female_name.append(1)
                else:
                    has_0_female_name.append(0)
                    has_1_female_name.append(0)
        data["Actress_0"] = has_0_female_name
        data["Actress_1"] = has_1_female_name

    data.to_csv(csv_out, index=False)

    if verbose:
        print("Feature generation complete, output to %s." % csv_out)