Example #1
from names_dataset import NameDataset


def namecheck(inp):
    # Return True if the input matches a known first or last name.
    m = NameDataset()
    inp = str(inp)
    if m.search_first_name(inp):
        return True
    if m.search_last_name(inp):
        return True
    return False
Example #2
import os
import sys

from names_dataset import NameDataset


def main():
    # read_dict_file is a helper defined elsewhere in the original script.
    m = NameDataset()
    if os.path.isfile(sys.argv[1]):
        words = read_dict_file(sys.argv[1])
    else:
        words = [sys.argv[1]]

    # Cheap word tokenizer: strip basic punctuation and split on spaces.
    words = ' '.join(words).replace('.', ' ').replace('?', ' ').replace("'", ' ').split(' ')
    output = ''
    for word in words:
        if m.search_first_name(word, use_upper_case=True):
            output += '\033[44m'   # ANSI escape: blue background for first names
            output += word
            output += '\033[0m'    # reset formatting
        elif m.search_last_name(word, use_upper_case=True):
            output += '\033[46m'   # ANSI escape: cyan background for last names
            output += word
            output += '\033[0m'
        else:
            output += word
        output += ' '
    print(output)
Example #3
from typing import List, Union

from names_dataset import NameDataset
from nltk.tokenize import word_tokenize


def remove_name(input_text_or_list: Union[str, List[str]]) -> List[str]:
    """ Remove any first or last names from the input text. """
    name_searcher = NameDataset()
    if isinstance(input_text_or_list, str):
        tokens = word_tokenize(input_text_or_list)
        processed_tokens = [
            token for token in tokens
            if (not name_searcher.search_first_name(token)) and (
                not name_searcher.search_last_name(token))
        ]
    else:
        processed_tokens = [
            token for token in input_text_or_list
            if (not name_searcher.search_first_name(token)) and (
                not name_searcher.search_last_name(token))
            and token is not None and len(token) > 0
        ]
    return processed_tokens
Example #4
import collections
import math
import pickle
import string

import nltk
from names_dataset import NameDataset
from nltk.corpus import wordnet


def calculate_query_TFIDF(query_string, inverted_index, num_files, profile):
    # remove_stopwords and stem_words are helpers defined elsewhere in the original project.
    # Words that appear often in profile text but say nothing about the user's likes/dislikes
    words_to_remove = ["birthday", "bday", "facebook", "lol", "thank", "christmas", "hanukkah", "happy"]

    # First we must preprocess the query (social media profile)
    m = NameDataset()
    tokens = nltk.word_tokenize(query_string)                           # Tokenizes the string using NLTK
    tokens = [x for x in tokens if x not in string.punctuation]         # Don't include punctuation
    query_tokens = remove_stopwords(tokens)                             # Remove the stopwords

    # Only include words that are: 1) in English, 2) not in words_to_remove, 3) not a first or last name
    query_tokens = [x for x in query_tokens if (wordnet.synsets(x) and x not in words_to_remove and
                                                not m.search_first_name(x) and not m.search_last_name(x))]

    query_tokens = stem_words(query_tokens)                             # Stem words for preprocessing

    for i in range(0, len(query_tokens)):                               # Converts all tokens to lowercase
        query_tokens[i] = query_tokens[i].lower()

    query_tokens = [x for x in query_tokens if x != 'birthdai']         # Drop the stemmed form of 'birthday', which is common but uninformative
    query_appearances = collections.Counter()
    query_weights = [0] * len(inverted_index)                           # Initialize vector to hold query weights
    query_length = 0.0
    l = list(inverted_index.keys())                                     # Index terms in insertion order, used to locate each term's position

    for query_token in query_tokens:                                    # Counter that keeps track of word appearances
        query_appearances[query_token] += 1

    # Iterate through each term in the query vector and assign nonzero weight if the term appears in inverted index
    for query_term in query_appearances:
        if query_term in inverted_index:
            index_of_word = l.index(query_term)                         # Since ordered dict, calculate index of term
            num_postings = inverted_index[query_term].length + 0.0      # Document frequency
            idf = math.log10(num_files / num_postings)                  # Inverse document frequency
            tf = query_appearances[query_term]                          # Term frequency
            query_weights[index_of_word] = tf * idf                     # Query weight
            query_length += (tf * idf) * (tf * idf)                     # Update running total for query length

    query_length = math.sqrt(query_length)                              # Calculate final query length

    # Write the query data to pickle files
    with open("data/" + profile + "/query_appearances.pickle", "wb") as pickle_out:
        pickle.dump(query_appearances, pickle_out)

    with open("data/" + profile + "/query_weights.pickle", "wb") as pickle_out:
        pickle.dump(query_weights, pickle_out)

    return (query_weights, query_length, query_appearances)             # Returns the tuple of necessary data
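
As a quick, purely illustrative sanity check of the tf-idf weighting used above (the numbers here are hypothetical, not taken from any profile):

import math

num_files = 100                              # total number of documents (hypothetical)
num_postings = 10                            # documents containing the term (hypothetical)
tf = 3                                       # occurrences of the term in the query (hypothetical)
idf = math.log10(num_files / num_postings)   # log10(100 / 10) = 1.0
weight = tf * idf                            # 3 * 1.0 = 3.0
print(idf, weight)                           # 1.0 3.0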
Example #5
    def fetch_training_data(keywords):

        frequent = TextClassifier.read_frequent_words()
        nd = NameDataset()
        #frequent.extend(TextClassifier.read_frequent_names())

        # Fetch training text for each keyword k in the category's keyword list
        training = []
        for k in keywords:
            urls = TextClassifier.fetch_urls(k, 10)
            text = []
            for url in urls:
                text.extend(TextClassifier.scrape_text(url))
            text = [
                t.lower() for t in text if t.lower() not in frequent
                and not nd.search_first_name(t) and not nd.search_last_name(t)
            ]
            print(text[0:100])
            print(len(text))
            training.append(text)
        return training
Example #6
    def wh_finder(self, entity_lst):
        # Stemmer
        porter = PorterStemmer()

        # All our question words
        wh_words = [
            'who', 'when', 'where', 'what', 'how many', 'what language',
            'what percentage', 'which', 'what time'
        ]

        # Language model to detect languages
        with open(
                os.path.join(settings.BASE_DIR,
                             'question_extractor/extract/models/language_model'),
                'rb') as model_file:
            language_model = pickle.load(model_file)
        # Location model to detect locations
        with open(
                os.path.join(settings.BASE_DIR,
                             'question_extractor/extract/models/location_model'),
                'rb') as model_file:
            location_model = pickle.load(model_file)
        # Names dataset
        m = NameDataset()

        i = 0
        word_scores = self.word_scores
        maxxy = -10

        while i < len(self.answer):
            # Stem and lower words
            word = porter.stem(self.answer[i][0].lower())
            if word in language_model:
                # Checking for languages, usually tagged as JJ. Tag as NN.
                self.answer[i] = (self.answer[i][0], 'NN', self.answer[i][2])
            # Keep words that are not stop words and are tagged as nouns or numbers
            if self.answer[i][0].lower() not in stop_words and self.answer[i][1] in (
                    'NNP', 'NNPS', 'NN', 'NNS', 'CD') and re.match(r'[\w]', word):
                # maxxy holds the highest word score seen so far; keep the most important word
                if word_scores[word] > maxxy:
                    if self.answer[i][0] not in entity_lst:
                        maxxy = word_scores[word]
                        j = i
                    # Make sure sentence is not excluded due to duplicate entity
                    else:
                        replace = i

            i += 1

        # If no new entity was found above, j was never assigned; fall back to the duplicate-entity position
        try:
            j = j
        except NameError:
            j = replace

        word = self.answer[j]
        self.entity = word[0]
        self.position = j

        # If the word is a proper or common noun (singular or plural)
        if word[1] in ('NNP', 'NNPS', 'NN', 'NNS'):
            # Four options: location (where), language (what language), name (who), other (what)

            # Location
            if word[0].lower() in location_model:
                self.wh_word = wh_words[2]
                self.type = 'location'

            # Language of some kind
            elif word[0].lower() in language_model:
                self.wh_word = wh_words[5]
                self.type = 'language'

            # Name
            elif m.search_first_name(word[0].lower()) or m.search_last_name(
                    word[0].lower()):
                self.wh_word = wh_words[0]
                self.type = 'name'

            # 'What' if none of the above
            else:
                self.wh_word = wh_words[3]
                self.type = 'thing'

        # If the word is a number (CD tag)
        elif word[1] == 'CD':
            # Time
            if re.match(r'([12][\d][:.][0-6][\d])+(am|pm)?', word[0]):  # Time
                self.wh_word = wh_words[8]  # What time
                self.type = 'time'
            # Date
            elif re.match(
                    r'[1,2,3][\d][/.][01][\d]?[\d]?[/.][10][\d][\d]?[\d]?',
                    word[0]):  #Date

                self.wh_word = wh_words[1]  # When
                self.type = 'date'
            # %
            elif '%' in word[0]:
                self.wh_word = wh_words[6]  # What %
                self.type = '%'
            # Day of month or placement
            elif re.match(r'[\d][\d]?(st|nd|rd|th)', word[0]):
                # Date with month
                months = [
                    "january", "february", "march", "april", "may", "june",
                    "july", "august", "september", "october", "november",
                    "december"
                ]
                if self.answer[j + 1][0].lower() in months or self.answer[
                        j + 2][0].lower() in months:
                    self.wh_word = wh_words[1]  # When
                    self.type = 'day_of_month'

                else:
                    self.wh_word = wh_words[7]  # Which
                    self.type = 'placement'
            # 'How many' if none of the above
            else:
                self.wh_word = wh_words[4]  # How many
                self.type = 'quantity'
Example #7
!pip install names-dataset
from names_dataset import NameDataset
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stopwords_english
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

m = NameDataset()
movie_words = ["movie", "film", "plot", "begins", "opens", "starts", "piece", "named", "woman", "women", "man", "men", "prologue", "help", "helping"]
stop_words = []
# Collect first/last names appearing in the first 5000 titles to use as extra stop words
# (corpus is defined earlier in the notebook).
for i in range(5000):
    title = word_tokenize(corpus[i])
    for word in title:
        if (m.search_first_name(word) or m.search_last_name(word)) and word not in stop_words:
            stop_words.append(word)

stop = list(stop_words) + list(movie_words)

"""## Μετατροπή σε TFIDF

Το πρώτο βήμα θα είναι λοιπόν να μετατρέψετε το corpus σε αναπαράσταση tf-idf:
"""

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
corpus_tf_idf = vectorizer.transform(corpus)

"""Η συνάρτηση [TfidfVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) όπως καλείται εδώ **δεν είναι βελτιστοποιημένη**. Οι επιλογές των μεθόδων και παραμέτρων της μπορεί να έχουν **δραματική επίδραση στην ποιότητα των συστάσεων** και είναι διαφορετικές για κάθε dataset. Επίσης, οι επιλογές αυτές έχουν πολύ μεγάλη επίδραση και στη **διαστατικότητα και όγκο των δεδομένων**. Η διαστατικότητα των δεδομένων με τη σειρά της θα έχει πολύ μεγάλη επίδραση στους **χρόνους εκπαίδευσης**, ιδιαίτερα στη δεύτερη εφαρμογή της άσκησης. Ανατρέξτε στα notebooks του εργαστηρίου και στο [FAQ](https://docs.google.com/document/d/1hou1gWXQuHAB7J2aV44xm_CtAWJ63q6Cu1V6OwyL_n0/edit?usp=sharing) των ασκήσεων.
Example #8
    # csv_file_input is the input CSV opened earlier in the original script.
    with open('data/scraped_results_1615315764360-out.csv',
              'w') as csv_file_output:
        csv_reader = csv.reader(csv_file_input, delimiter=',')
        writer = csv.writer(csv_file_output, lineterminator='\n')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                print(f'Column names are {", ".join(row)}')
                writer.writerow(row)
                line_count += 1
            else:
                # print(f'\t{row[0]} works in the {row[1]} department, and was born in {row[2]}.')
                firstAndLastName = row[0].split()
                m = NameDataset()
                firstNameRes = m.search_first_name(firstAndLastName[0])
                lastNameRes = m.search_last_name(firstAndLastName[1])

                if firstNameRes:
                    print("First name is valid", firstNameRes)
                    row[1] = firstAndLastName[0]

                if lastNameRes:
                    print("Last name is valid", lastNameRes)
                    row[2] = firstAndLastName[1]

                writer.writerow(row)

                # print(firstNameRes, lastName)

                # print(firstAndLastName[0])
                line_count += 1
Example #9
from names_dataset import NameDataset

def is_name(word):
    # True only if the word is both a known first name and a known last name
    m = NameDataset()
    return m.search_first_name(word) and m.search_last_name(word)
Example #10
from names_dataset import NameDataset

m = NameDataset()
print(m.search_first_name('Brian'))
print(m.search_last_name('Remy'))
Example #11
# m is a NameDataset instance created earlier in the original script.
tokenized_tweets = []
for tweet in cleaned_tweets:
    # Tokenize each cleaned tweet
    tokenized_tweets.append(word_tokenize(tweet))

names_list = []
# Extend the stop-word set with ceremony-specific words to be removed from the tokenized tweets
stop_words.update(
    ["congratulations", "winner", "oscar", "oscars", "last", "night", "rt", "sent", "dm", "big", "best", "new", "tweet",
     "happy"])

for tweet in tokenized_tweets:
    temp = []
    for word in set(tweet):
        if word.lower() not in stop_words:
            # checking if that word is a name of person
            if m.search_first_name(word.lower()) or m.search_last_name(word.lower()):
                temp.append(word.lower())
                # joining first name and last name
                if len(temp) > 1:
                    temp = [" ".join(temp)]
    # Add the detected name to the list if one was found and is not already present
    if len(temp) != 0:
        name = temp[0]
        if name not in names_list:
            names_list.append(name)

print("Names of winners: ")
print(set(names_list))