def __init__(self, text):
    file_list = util.Create_files_list('./texts', 'xml')

    for file in file_list:
        full_file = './texts/' + file
        word_list = []  # TODO: find a good way to append multiple texts (for sake of word embeddings)

        with open(full_file) as fh:
            # Use beautiful soup to process the xml
            soupedEntry = BeautifulSoup(fh, "xml")
            # Take only line entries approved by pedecerto
            # FIXME: this might skew results
            soupedEntry = util.clean(soupedEntry('line'))

            # Create the word list by looping through each line of the text
            for line in range(len(soupedEntry)):
                # Extend the word list with the syllables of each line
                word_list.extend(self.Syllabify_line(soupedEntry[line]))

            # Clean the text (already partly done by pedecerto)
            word_list = self.Remove_numbers(word_list)
            word_list = self.Remove_element_from_list(word_list, '')
            word_list = self.Lowercase_list(word_list)

    # Store this character list for later retrieval
    self.character_list = word_list
def __init__(self, path, givenLine):
    # Create pandas dataframe
    column_names = ["author", "text", "line", "syllable", "foot", "feet_pos", "length"]
    # column_names = ["author", "text", "line", "syllable", "foot", "feet_pos", "length", "word_boundary", "metrical_feature"]
    self.df = pd.DataFrame(columns=column_names)  # FIXME: bad practice to work with self.df. Only update at the end.

    # Add all entries to process to a list
    entries = self.CreateFilesList(path, 'xml')

    # Process all entries added to the list
    for entry in entries:
        with open('./texts/' + entry) as fh:
            # Use beautiful soup to process the xml
            soupedEntry = BeautifulSoup(fh, "xml")
            # Retrieve the title and author from the xml file
            self.title = soupedEntry.title.string
            self.author = soupedEntry.author.string
            # Clean the lines (done by MQDQ, don't know what it does exactly)
            soupedEntry = util.clean(soupedEntry('line'))

            if givenLine == -1:
                # Do the entire folder
                for line in range(len(soupedEntry)):
                    print('Progress on', self.author, self.title, ':',
                          round(line / len(soupedEntry) * 100, 2), "%")
                    # Process the entry. It will append the line to the df
                    self.df = self.ProcessLine(soupedEntry[line], self.df)
            else:
                # Process just the given line (testing purposes).
                self.df = self.ProcessLine(soupedEntry[givenLine], self.df)
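# Illustrative sketch (not part of the parser above): how BeautifulSoup's "xml"
# parser exposes the title, author and line entries that the __init__ above
# relies on. The toy markup below is invented for demonstration only; the real
# pedecerto/MQDQ schema is richer and util.clean() does additional filtering.
from bs4 import BeautifulSoup  # the "xml" parser requires lxml to be installed

toy_xml = """
<text>
  <title>Aeneis</title>
  <author>Vergilius</author>
  <division title="1">
    <line name="1">Arma virumque cano</line>
    <line name="2">Troiae qui primus ab oris</line>
  </division>
</text>
"""

soup = BeautifulSoup(toy_xml, "xml")
print(soup.title.string)       # 'Aeneis'
print(soup.author.string)      # 'Vergilius'
for line in soup('line'):      # soup('line') is shorthand for soup.find_all('line')
    print(line['name'], line.get_text())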
def save_dict_index(path):
    index = {}
    dictionary = []
    raw_dictionary = {}

    with open(path + '/preprocessed.json') as file:
        data = json.load(file)

    # https://stackoverflow.com/questions/3002085/python-to-print-out-status-bar-and-percentage
    bar = progressbar.ProgressBar(maxval=len(data))
    bar.start()

    for doc in data:
        words = word_tokenize(doc['title'])
        frequency = {}
        if 'body' in doc:
            words += word_tokenize(doc['body'])

        for word in words:
            word = word.lower()
            if word in raw_dictionary:
                raw_dictionary[word] += 1
            else:
                raw_dictionary[word] = 1

            word = clean(word, settings)
            if word is None or word == '':
                continue
            if word in frequency:
                frequency[word] += 1
            else:
                frequency[word] = 1

        for word in frequency:
            if word not in index:
                index[word] = {}
                index[word]['docs'] = []
                dictionary.append(word)
            index[word]['docs'].append({'id': doc['id'], 'tf': frequency[word]})

        bar.update(doc['id'])
    bar.finish()

    # Add VSM calculations to index
    num_docs = len(data)
    for word in index:
        index[word]['idf'] = math.log(num_docs / (len(index[word]['docs'])), 10)

    with open(path + '/dictionary.json', 'w') as outfile:
        json.dump(dictionary, outfile, indent=2, ensure_ascii=False)
    with open(path + '/raw_dictionary.json', 'w') as outfile:
        json.dump(raw_dictionary, outfile, indent=2, ensure_ascii=False)
    with open(path + '/index.json', 'w') as outfile:
        json.dump(index, outfile, ensure_ascii=False)
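# Illustrative sketch (not part of the original module): querying the index
# that save_dict_index() writes out. It relies only on the structure produced
# above: index[word]['docs'] as a list of {'id', 'tf'} plus index[word]['idf'].
# A real query would be tokenized and cleaned exactly like the indexing step
# (word_tokenize + clean(word, settings)); plain lower()/split() is used here
# only to keep the sketch self-contained.
import json

def score_query(path, query):
    with open(path + '/index.json') as file:
        index = json.load(file)

    scores = {}
    for word in query.lower().split():
        if word not in index:
            continue
        for posting in index[word]['docs']:
            # Accumulate a simple tf-idf score per document
            scores[posting['id']] = scores.get(posting['id'], 0) + \
                posting['tf'] * index[word]['idf']

    # Highest-scoring documents first
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)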
def get_table_from(page_html, cleaned=False, stripped=True):
    '''Returns formatted_tables, a list of DataFrame objects corresponding to
    the tables on page_html that match the BOM page format. If that conversion
    doesn't happen, returns a list of lists instead: each inner list represents
    a row within a table, each outer list represents a table, and the elements
    of the inner lists are all strings.

    If cleaned is set to True, the 'clean' utility is called to try to force
    each table into a DataFrame object. cleaned defaults to False because the
    vast majority of cases on boxofficemojo don't require the extra overhead.

    This function strips a lot of content that might be unwanted: nested
    tables, Javascript script and option tags, and HTML forms. If you want to
    retrieve this content, for whatever reason, set stripped to False and the
    function will also return a list of the unwanted objects along with
    formatted_tables.'''
    identifier = '#dcdcdc'
    # Reduces page parsing time by parsing only tables
    strainer = BeautifulSoup(page_html, parse_only=SoupStrainer("table"))
    # Finds the subset of tables matching our identifier
    raw_tables = strainer.find_all(
        lambda tag: tag.name == 'table' and tag.tr.has_attr('bgcolor')
        and tag.tr['bgcolor'] == identifier)
    # Gets rid of nested tables, Javascript tags, options and forms
    unwanted_objects = [
        undesired_object.extract() for table in raw_tables
        for undesired_object in table(["script", "option", "table", "form"])
    ]
    # Builds a list of lists of table text with clear formatting,
    # e.g. trailing newline characters stripped
    formatted_tables = [[[item.text.rstrip('\n') for item in row.find_all("td")]
                         for row in table.find_all("tr")]
                        for table in raw_tables]
    try:
        # Attempts conversion to DataFrame, using the first row as the header
        formatted_tables = [
            DataFrame(table[1:], columns=table[0]) for table in formatted_tables
        ]
    except Exception as ex:
        # Warns if DataFrame conversion fails, without preventing further execution
        print("Failed to convert to DataFrame object. Returning as nested list...")
        print("Error returned was: " + str(ex))
    if cleaned:
        from utilities import clean
        formatted_tables = clean(formatted_tables)  # attempts to force DataFrame conversion
    if stripped:
        return formatted_tables
    return formatted_tables, unwanted_objects
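# Illustrative sketch (not part of the original module): fetching a page and
# feeding its HTML to get_table_from. The URL is a placeholder, not a tested
# endpoint, and the `requests` library is assumed to be available.
import requests

response = requests.get('https://www.boxofficemojo.com/some-page/')  # placeholder URL
tables = get_table_from(response.text)
for table in tables:
    # Each element is a DataFrame (or a nested list if conversion failed)
    print(table.head() if hasattr(table, 'head') else table[:2])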
def __init__(self, path):
    # Create pandas dataframe
    column_names = ["title", "line", "syllable", "length"]
    df = pd.DataFrame(columns=column_names)

    # Add all entries to process to a list
    entries = util.Create_files_list(path, 'xml')

    # Process all entries added to the list
    for entry in entries:
        with open(path + entry) as fh:
            # For each text, an individual dataframe will be created and saved as pickle
            new_text_df = copy.deepcopy(df)
            pickle_name = 'syllable_label_' + entry + '.pickle'

            # Use beautiful soup to process the xml
            soupedEntry = BeautifulSoup(fh, "xml")
            # Retrieve the title and author from the xml file
            text_title = str(soupedEntry.title.string)
            author = str(soupedEntry.author.string)
            # Clean the lines (done by MQDQ)
            soupedEntry = util.clean(soupedEntry('line'))

            # for line in range(len(soupedEntry)):
            for line in Bar('Processing {0}, {1}'.format(author, text_title)).iter(range(len(soupedEntry))):
                book_title = int(soupedEntry[line].parent.get('title'))
                # Process the entry. It will append the line to the df
                if not soupedEntry[line]['name'].isdigit():
                    continue  # If our line name is not a digit, the line is uncertain. We skip over it
                line_df = self.Process_line(soupedEntry[line], book_title)
                new_text_df = new_text_df.append(line_df, ignore_index=True)  # If I greatly improve my own code, am I a wizard, or a moron?

            # Clean the lines that did not succeed
            new_text_df = self.clean_generated_df(new_text_df)

            util.Pickle_write(util.cf.get('Pickle', 'path'), pickle_name, new_text_df)
            'baseline', 'baseline corrected', 'median correction of CR']);

# Apply the correction:
# (just replace the whole spectra containing the cosmic ray
# with the median spectra of its neighborhood)
if len(CR_cand_ind) > 0:
    corrected_spectra[CR_cand_ind] = med_spectra_x[CR_cand_ind]

# %%
# =============================================================================
# NMF step
# =============================================================================
spectra_cleaned = clean(sigma_kept, spectra_denoised, mode='area')

_n_components = initialization['NMF_NumberOfComponents']
nmf_model = decomposition.NMF(n_components=_n_components, init='nndsvda',
                              max_iter=7, l1_ratio=1)

_start = time()
print('starting nmf... (be patient, this may take some time...)')
mix = nmf_model.fit_transform(spectra_cleaned)
components = nmf_model.components_
reconstructed_spectra1 = nmf_model.inverse_transform(mix)
_end = time()
print(f'nmf done in {_end - _start:.2f}s')

# %%
# =============================================================================
# preparing the mixture coefficients
import collections

import numpy as np
import pandas as pd
import pickle
from utilities import clean
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

train_data = pd.read_csv('./data/train.csv')
columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
values = {c: train_data[c] for c in columns}
texts = [clean(t) for t in train_data["comment_text"]]
del train_data

# remove duplicates
tmp = set([x for x, count in collections.Counter(texts).items() if count == 1])
tmp = [i for i, x in enumerate(texts) if x in tmp]
texts = [texts[i] for i in tmp]
for c in columns:
    values[c] = [values[c][i] for i in tmp]

# remove clean comments with length greater than 4000
# (we already have too many clean comments)
clean_ind = {}
for i in range(len(texts)):
    s = 0
    for c in columns:
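# Illustrative sketch (assumed; the snippet above is truncated and this is not
# its actual continuation): one common way to wire the imports at the top
# together, training an independent TF-IDF + SGD classifier per label and
# evaluating each with ROC AUC. The vectorizer and classifier settings here
# are placeholders, not tuned values from the original project.
vectorizer = TfidfVectorizer(max_features=50000)
X = vectorizer.fit_transform(texts)

classifiers = {}
for c in columns:
    y = values[c]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    # A probabilistic loss enables predict_proba (use loss='log' on older scikit-learn)
    clf = SGDClassifier(loss='log_loss', max_iter=1000)
    clf.fit(X_train, y_train)
    classifiers[c] = clf
    auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    print(c, 'ROC AUC:', round(auc, 3))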
from os import listdir

from utilities import createDictionary
from utilities import clean, word_freq, PMI

print("Setting Data Directories")
# setting raw data directories
datadir = "../DataExtraction/db/cwncorpus/"
files = listdir(datadir)
data = []

print("Reading in Data Files")
# reading in raw data files
for i in files:
    with open(datadir + i, "r+", encoding="latin-1") as ff:
        data.append(ff.read())

print("Creating Wordlists")
data = " ".join(data)
print(word_freq(clean(data)))
words = []
sentences = []
sentence_count = []
uniqueWords = []
totalRemoved = 0
c = 0

for idx, row in f.iterrows():
    if idx % round(0.1 * len(f)) == 0:
        print(str(10 * c) + '%')
        c += 1

    author, time, text = row['StudentId'], row['Date'], row['Text']
    text = util.clean(text)
    text = text[200:]

    lengths.append(len(text))
    uniques.append(len(set(list(text))))

    tokens = sent_tokenize(text)
    sentence_count += [len(tokens)]
    sent_words = [len(util.wordProcess(x)) for x in tokens]
    sentences += [sum(sent_words) / len(tokens)] if len(tokens) != 0 else [0]

    cleanText = util.wordProcess(text)
    words.append(len(cleanText))
    uniqueWords.append(len(set(cleanText)))
def predict(msg):
    predictions = {}
    test_vector = vectorizer.transform([clean(msg)])
    for c in columns:
        predictions[c] = classifiers[c].predict_proba(test_vector)[:, -1][0]
    return jsonify(predictions)
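# Illustrative sketch (assumed, not the original application code): exposing
# predict() through a minimal Flask endpoint, since jsonify() implies a Flask
# context. The route name and request format are placeholders.
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict_endpoint():
    msg = request.get_json().get('message', '')
    return predict(msg)  # predict() already returns a jsonify()'d response

if __name__ == '__main__':
    app.run(debug=True)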
index = json.load(indexFile)
settings = json.load(settingsFile)

bar = progressbar.ProgressBar(maxval=len(topics))
bar.start()

for topic in topics:
    if topic['assigned'] == False:
        scores = {}
        word_count = {}
        doc = preprocessed[topic['id']]
        words = word_tokenize(doc['title'])
        if 'body' in doc:
            words += word_tokenize(doc['body'])

        # Count the cleaned words of this document
        for word in words:
            tmp_word = clean(word.lower(), settings)
            if tmp_word is None or tmp_word == '':
                continue
            if tmp_word in word_count:
                word_count[tmp_word] += 1
            else:
                word_count[tmp_word] = 1

        # Accumulate tf-idf scores against candidate documents
        for word in word_count:
            for item in index[word]['docs']:
                # Only find score of objects that have topics
                if item['id'] != topic['id'] and 'topics' in topics[item['id']]:
                    score = item['tf'] * index[word]['idf'] * word_count[word]
                    if item['id'] in scores:
                        scores[item['id']] += score
                    else:
                        scores[item['id']] = score