def __init__(self, text):

        file_list = util.Create_files_list('./texts', 'xml')

        for file in file_list:

            full_file = './texts/' + file

            word_list = []
            # TODO: find a good way to append multiple texts (for sake of word embeddings)
            with open(full_file) as fh:
                # Use beautiful soup to process the xml
                soupedEntry = BeautifulSoup(fh, "xml")
                # Take only line entries approved by pedecerto #FIXME: this might skew results
                soupedEntry = util.clean(soupedEntry('line'))
                # Create the word list by looping through each line of the text
                for line in range(len(soupedEntry)):
                    # Extend the word list with the syllables of each line
                    word_list.extend(self.Syllabify_line(soupedEntry[line]))

            # Clean the text (already partly done by pedecerto)
            word_list = self.Remove_numbers(word_list)
            word_list = self.Remove_element_from_list(word_list, '')
            word_list = self.Lowercase_list(word_list)

            # Store this character list for later retrieval
            # (note: this is overwritten for each file, so only the last processed text is kept; see the TODO above)
            self.character_list = word_list
Example #2
    def __init__(self, path, givenLine):
        # Create pandas dataframe
        column_names = [
            "author", "text", "line", "syllable", "foot", "feet_pos", "length"
        ]
        # column_names = ["author", "text", "line", "syllable", "foot", "feet_pos", "length", "word_boundary", "metrical_feature"]
        self.df = pd.DataFrame(
            columns=column_names
        )  #FIXME: bad practise to work with self.df. Only update at the end.

        # Add all entries to process to a list
        entries = self.CreateFilesList(path, 'xml')
        # Process all entries added to the list
        for entry in entries:
            with open('./texts/' + entry) as fh:
                # Use beautiful soup to process the xml
                soupedEntry = BeautifulSoup(fh, "xml")
                # Retrieve the title and author from the xml file
                self.title = soupedEntry.title.string
                self.author = soupedEntry.author.string
                # Clean the lines (cleaning is provided by MQDQ; its exact behaviour is not documented)
                soupedEntry = util.clean(soupedEntry('line'))
                if givenLine == -1:
                    # Do the entire folder
                    for line in range(len(soupedEntry)):
                        print('Progress on', self.author, self.title, ':',
                              round(line / len(soupedEntry) * 100, 2), "%")
                        # Process the entry. It will append the line to the df
                        self.df = self.ProcessLine(soupedEntry[line], self.df)
                else:
                    # Process just the given line (testing purposes).
                    self.df = self.ProcessLine(soupedEntry[givenLine], self.df)
Example #3
def save_dict_index(path):
    index = {}
    dictionary = []
    raw_dictionary = {}

    with open(path + '/preprocessed.json') as file:
        data = json.load(file)
        # https://stackoverflow.com/questions/3002085/python-to-print-out-status-bar-and-percentage
        bar = progressbar.ProgressBar(maxval=len(data))
        bar.start()
        for doc in data:
            words = word_tokenize(doc['title'])
            frequency = {}
            if 'body' in doc:
                words += word_tokenize(doc['body'])
            for word in words:
                word = word.lower()
                if word in raw_dictionary:
                    raw_dictionary[word] += 1
                else:
                    raw_dictionary[word] = 1

                word = clean(word, settings)
                if word is None or word == '':
                    continue
                if word in frequency:
                    frequency[word] += 1
                else:
                    frequency[word] = 1
            for word in frequency:
                if word not in index:
                    index[word] = {}
                    index[word]['docs'] = []
                    dictionary.append(word)
                index[word]['docs'].append({
                    'id': doc['id'],
                    'tf': frequency[word]
                })
            bar.update(doc['id'])
        bar.finish()
        # Add VSM calculations to index
        num_docs = len(data)
        for word in index:
            index[word]['idf'] = math.log(
                num_docs / (len(index[word]['docs'])), 10)

    with open(path + '/dictionary.json', 'w') as outfile:
        json.dump(dictionary, outfile, indent=2, ensure_ascii=False)

    with open(path + '/raw_dictionary.json', 'w') as outfile:
        json.dump(raw_dictionary, outfile, indent=2, ensure_ascii=False)

    with open(path + '/index.json', 'w') as outfile:
        json.dump(index, outfile, ensure_ascii=False)
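A minimal usage sketch (not part of the original): scoring an ad-hoc query against the index written by save_dict_index, reusing the tf and idf values it precomputes. The import locations of word_tokenize and clean, and the score_query name, are assumptions.

import json

from nltk.tokenize import word_tokenize  # assumed source of word_tokenize, as in the snippet above
from utilities import clean  # assumed location of the project's clean() helper


def score_query(path, query, settings):
    # Load the inverted index written by save_dict_index
    with open(path + '/index.json') as file:
        index = json.load(file)
    scores = {}
    for word in word_tokenize(query):
        word = clean(word.lower(), settings)  # same clean(word, settings) call as above
        if not word or word not in index:
            continue
        for posting in index[word]['docs']:
            # accumulate tf * idf per document
            scores[posting['id']] = scores.get(posting['id'], 0) + posting['tf'] * index[word]['idf']
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)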
Example #4
def get_table_from(page_html, cleaned=False, stripped=True):
    '''Returns formatted_tables, a list of DataFrame objects, one for each table on page_html that matches the BOM (Box Office Mojo) page format.

    If the DataFrame conversion fails, a list of lists is returned instead. Each innermost list represents a row within a table; each outer list represents a table.
    The elements of the innermost lists are all strings.

    If cleaned is set to True, the 'clean' utility is called to try to force the table into a DataFrame object. cleaned defaults to False because
    the vast majority of cases on boxofficemojo don't require the extra overhead.

    This strips content that might be unwanted: nested tables, Javascript script and option tags, and HTML forms.
    If you want to retrieve this content, set stripped to False and the function will return a list of the stripped elements along with formatted_tables.'''

    identifier = '#dcdcdc'

    strainer = BeautifulSoup(page_html, parse_only=SoupStrainer(
        "table"))  #reduces page parsing time by searching for only tables

    raw_tables = strainer.find_all(
        lambda tag: tag.name == 'table' and tag.tr is not None
        and tag.tr.has_attr('bgcolor') and tag.tr['bgcolor'] == identifier
    )  # finds the subset of tables matching our identifier (guards against tables with no <tr>)

    unwanted_objects = [
        undesired_object.extract() for table in raw_tables
        for undesired_object in table(["script", "option", "table", "form"])
    ]  # gets rid of nested tables, Javascript tags and forms

    formatted_tables = [
        [[item.text.rstrip('\n') for item in row.find_all("td")]
         for row in table.find_all("tr")] for table in raw_tables
    ]  # returns a list of lists comprising table text with clear formatting e.g. newline characters stripped, etc.

    try:
        formatted_tables = [
            DataFrame(table[1:], columns=table[0])
            for table in formatted_tables
        ]  #attempts conversion to DataFrame
    except Exception as ex:
        print(
            "Failed to convert to DataFrame object. Returning as nested list..."
        )  #warns if DataFrame conversion fails
        print(
            "Error returned was: " + str(ex)
        )  # lets you know what the error was without preventing further execution of code

    if cleaned:
        from utilities import clean
        formatted_tables = clean(
            formatted_tables)  # attempts to force DataFrame conversion

    if stripped:
        return formatted_tables
    else:
        return formatted_tables, unwanted_objects
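A short usage sketch (not from the original source), assuming requests is installed and that url points to an old table-based Box Office Mojo page; the URL below is a placeholder:

import requests

url = 'https://www.boxofficemojo.com/...'  # placeholder BOM page URL
page_html = requests.get(url).text
tables, leftovers = get_table_from(page_html, cleaned=False, stripped=False)
for table in tables:
    print(table)  # a DataFrame when conversion succeeded, otherwise a nested list of strings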
Example #5
    def __init__(self, path):

        # Create pandas dataframe
        column_names = ["title", "line", "syllable", "length"]
        df = pd.DataFrame(columns=column_names)

        # Add all entries to process to a list
        entries = util.Create_files_list(path, 'xml')
        # Process all entries added to the list
        for entry in entries:
            with open(path + entry) as fh:
                # for each text, an individual dataframe will be created and saved as pickle
                new_text_df = copy.deepcopy(df)
                pickle_name = 'syllable_label_' + entry + '.pickle'

                # Use beautiful soup to process the xml
                soupedEntry = BeautifulSoup(fh, "xml")
                # Retrieve the title and author from the xml file
                text_title = str(soupedEntry.title.string)
                author = str(soupedEntry.author.string)
                # Clean the lines (done by MQDQ)
                soupedEntry = util.clean(soupedEntry('line'))

                # for line in range(len(soupedEntry)):
                for line in Bar('Processing {0}, {1}'.format(
                        author, text_title)).iter(range(len(soupedEntry))):
                    book_title = int(soupedEntry[line].parent.get('title'))
                    # Process the entry. It will append the line to the df
                    if not soupedEntry[line]['name'].isdigit():
                        continue  # If our line name is not a digit, the line is uncertain. We skip over it
                    line_df = self.Process_line(soupedEntry[line], book_title)
                    new_text_df = new_text_df.append(
                        line_df, ignore_index=True
                    )  # If I greatly improve my own code, am I a wizard, or a moron?

                # Clean the lines that did not succeed
                new_text_df = self.clean_generated_df(new_text_df)

                util.Pickle_write(util.cf.get('Pickle', 'path'), pickle_name,
                                  new_text_df)
Example #6
                                                   'baseline',
                                                   'baseline corrected',
                                                   'median correction of CR']);

    # Apply the correction:
    # (replace each spectrum containing a cosmic ray
    # with the median spectrum of its neighborhood)
    if len(CR_cand_ind) > 0:
        corrected_spectra[CR_cand_ind] = med_spectra_x[CR_cand_ind]

# %%
# =============================================================================
#                                   NMF step
# =============================================================================

spectra_cleaned = clean(sigma_kept, spectra_denoised, mode='area')

_n_components = initialization['NMF_NumberOfComponents']
nmf_model = decomposition.NMF(n_components=_n_components, init='nndsvda',
                              max_iter=7, l1_ratio=1)
_start = time()
print('starting nmf... (be patient, this may take some time...)')
mix = nmf_model.fit_transform(spectra_cleaned)
components = nmf_model.components_
reconstructed_spectra1 = nmf_model.inverse_transform(mix)
_end = time()
print(f'nmf done in {_end - _start:.2f}s')
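
# Optional sanity check (a sketch, assuming numpy is available as np in this
# script): compare the NMF reconstruction against the cleaned input spectra.
_rel_err = (np.linalg.norm(spectra_cleaned - reconstructed_spectra1)
            / np.linalg.norm(spectra_cleaned))
print(f'relative NMF reconstruction error: {_rel_err:.3f}')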

# %%
# =============================================================================
#                    preparing the mixture coefficients
Example #7
import collections
import numpy as np
import pandas as pd
import pickle
from utilities import clean

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

train_data = pd.read_csv('./data/train.csv')
columns = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]

values = {c: train_data[c] for c in columns}
texts = [clean(t) for t in train_data["comment_text"]]
del train_data

# remove duplicates: keep only comments whose text occurs exactly once (all copies of duplicated texts are dropped)
tmp = set([x for x, count in collections.Counter(texts).items() if count == 1])
tmp = [i for i, x in enumerate(texts) if x in tmp]
texts = [texts[i] for i in tmp]
for c in columns:
    values[c] = [values[c][i] for i in tmp]

# remove clean (non-toxic) comments longer than 4000 characters
# (we already have more than enough clean comments)
clean_ind = {}
for i in range(len(texts)):
    s = 0
    for c in columns:
Example #8
from utilities import createDictionary
from os import listdir
from utilities import clean, word_freq, PMI

print("Setting Data Directories")
#setting raw data directories
datadir = "../DataExtraction/db/cwncorpus/"
files = listdir(datadir)

data = []

print("Reading in Data Files")
#reading in raw data files
for i in files:
    with open(datadir + i, "r", encoding="latin-1") as ff:
        data.append(ff.read())

print("Creating Wordlists")
data = " ".join(data)

print(word_freq(clean(data)))
Example #9
words = []
sentences = []
sentence_count = []
uniqueWords = []
lengths = []  # character count of each cleaned text (used below)
uniques = []  # number of distinct characters in each text (used below)

totalRemoved = 0

c = 0
for idx, row in f.iterrows():
    if idx % round(0.1 * len(f)) == 0:
        print(str(10 * c) + '%')
        c += 1

    author, time, text = row['StudentId'], row['Date'], row['Text']

    text = util.clean(text)
    text = text[200:]

    lengths.append(len(text))
    uniques.append(len(set(list(text))))

    tokens = sent_tokenize(text)
    sentence_count += [len(tokens)]
    sent_words = [len(util.wordProcess(x)) for x in tokens]

    sentences += [sum(sent_words) / len(tokens)] if len(tokens) != 0 else [0]

    cleanText = util.wordProcess(text)
    words.append(len(cleanText))
    uniqueWords.append(len(set(cleanText)))
Example #10
def predict(msg):
    predictions = {}
    test_vector = vectorizer.transform([clean(msg)])
    for c in columns:
        predictions[c] = classifiers[c].predict_proba(test_vector)[:, -1][0]
    return jsonify(predictions)
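A hedged sketch of how predict might be exposed over HTTP, assuming a Flask app and that the fitted TfidfVectorizer and per-label classifiers were pickled during training; the model.pickle path and route name are hypothetical:

import pickle
from flask import Flask, request

app = Flask(__name__)

# Hypothetical: the training step stored (vectorizer, classifiers, columns) in a single pickle
with open('./data/model.pickle', 'rb') as fh:
    vectorizer, classifiers, columns = pickle.load(fh)

@app.route('/predict', methods=['POST'])
def predict_route():
    # predict() above already returns a jsonify()-ed response
    return predict(request.json['msg'])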
Example #11
    index = json.load(indexFile)
    settings = json.load(settingsFile)

    bar = progressbar.ProgressBar(maxval=len(topics))
    bar.start()

    for topic in topics:
        if not topic['assigned']:
            scores = {}
            word_count = {}
            doc = preprocessed[topic['id']]
            words = word_tokenize(doc['title'])
            if 'body' in doc:
                words += word_tokenize(doc['body'])
            for word in words:
                tmp_word = clean(word.lower(), settings)
                if tmp_word is None or tmp_word == '':
                    continue
                if tmp_word in word_count:
                    word_count[tmp_word] += 1
                else:
                    word_count[tmp_word] = 1
            for word in word_count:
                for item in index[word]['docs']:
                    # Only score documents that already have topics assigned
                    if item['id'] != topic['id'] and 'topics' in topics[item['id']]:
                        score = item['tf'] * index[word]['idf'] * word_count[word]
                        if item['id'] in scores:
                            scores[item['id']] += score
                        else:
                            scores[item['id']] = score