def flair_predictor(input_url):
    # Fetch the submission, preprocess its title, and vectorize it with the
    # pickled CountVectorizer / TF-IDF transformers.
    submission = reddit.submission(url=input_url)
    rows = []
    rows.append([submission.title, submission.link_flair_text])
    df = pd.DataFrame(rows, columns=['title', 'flair'])
    df['title'] = preprocessor(df['title'])
    text = df['title']
    topic = df['flair']

    input_tf = pickled_CV_model.transform(text)
    input_tfidf = pickled_TF_model.transform(input_tf)

    output_SVM = pickled_SVM_model.predict(input_tfidf)
    output_NB = pickled_NB_model.predict(input_tfidf)
    output_RFC = pickled_RFC_model.predict(input_tf)

    score_svm = np.max(pickled_SVM_model.decision_function(input_tfidf))
    score_nb = np.max(pickled_NB_model.predict_proba(input_tfidf))
    score_rfc = np.max(pickled_RFC_model.predict_proba(input_tf))

    # Return the prediction from whichever model is most confident.
    if score_nb > score_svm and score_nb > score_rfc:
        print("predicted flair is ", output_NB)
        output = output_NB
    elif score_svm > score_rfc:
        print("predicted flair is ", output_SVM)
        output = output_SVM
    else:
        print("predicted flair is ", output_RFC)
        output = output_RFC
    return output
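# A minimal usage sketch (not part of the original source): the URL below is a
# hypothetical placeholder, and `reddit`, `preprocessor`, and the pickled models
# are assumed to be set up exactly as flair_predictor above expects.
if __name__ == "__main__":
    sample_url = "https://www.reddit.com/r/<subreddit>/comments/<post_id>/"  # placeholder URL
    predicted_flair = flair_predictor(sample_url)
    print("final prediction:", predicted_flair)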
def demo(args, noise=0, test_char=42):
    # Input the number of the character
    if args.gui:
        args.test_char = test_char
    else:
        test_char = 0
        while test_char < 1 or test_char > args.char_max:
            test_char = int(input(f"Input the number of character (1-{args.char_max}):"))
            if test_char < 1 or test_char > args.char_max:
                print("Please input the number in the correct range!")
            else:
                args.test_char = test_char
                break

    # Input the upper limit of the noise range
    if args.gui:
        args.noise = [-1 * noise, noise]
    else:
        noise = -1
        while noise < 0 or noise > args.noise_max:
            noise = float(input(f"Input the upper limit of the noise range (0-{args.noise_max}):"))
            if noise < 0 or noise > args.noise_max:
                print("Please input the number in the correct range!")
            else:
                args.noise = [-1 * noise, noise]
                break

    if os.path.exists(args.test_path):
        shutil.rmtree(args.test_path)
    if os.path.exists(args.save_path):
        shutil.rmtree(args.save_path)

    exe_stat.append(preprocessor(args))
    print('\n===================================================')
    exe_stat.append(demo_test(args))
    print('\n===================================================')
    exe_stat.append(postprocessor(args))
    print('\n===================================================')
    exe_stat.append(verification(args))
    if args.usb_path is not None:
        print('\n===================================================')
        exe_stat.append(demo_post(args))
    print('\n===================================================')
    print(f'Testing number {args.test_char} with noise {args.noise}, Done!!!')
def main():
    dataset = pd.read_csv("./data/train.csv")

    # Preprocess the data.
    X_train, Y_train = preprocessor(dataset,
                                    fill_age_with='advanced_median',
                                    dropPassengerID=True,
                                    dropName=True)

    print(X_train)
    print(Y_train)
def preprocess_query(query):
    query_df = {}
    query = query.lower()
    query_words = query.split(' ')
    for qword in query_words:
        text = preprocessor(qword)
        if text == "a":
            continue
        if text not in query_df:
            query_df[text] = 1
        else:
            query_df[text] += 1
    return query_df
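# Illustrative call (an assumption, not from the original source): the query
# string below is made up; preprocess_query returns a dict mapping each
# preprocessed query term to its frequency within the query.
example_counts = preprocess_query("experimental investigation of boundary layer flow")  # hypothetical query
print(example_counts)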
def main():
    print("Begin program")
    createFolders(["plots"])

    path = "~/QCD_Flat_15_7000_correct/"
    QCDTrain = getSamples([path + "trackingNtuple.root",
                           path + "trackingNtuple2.root",
                           path + "trackingNtuple3.root",
                           path + "trackingNtuple4.root"])
    weights = domainAdaptationWeights(QCDTrain, "datasets/T5qqqqWW.root")

    preproc = preprocessor(0.05, 0.95)
    preproc.fit(QCDTrain.loc[:, inputVariables + ["trk_algo"]])
    QCDTrainPreprocessed = preproc.process(QCDTrain.loc[:, inputVariables + ["trk_algo"]])
    means, scales = preproc.getMeansAndScales()

    # The outputs of these printouts are to be used as the cutoff values when evaluating the
    # deployed model in CMSSW. See RecoTracker/FinalTrackSelectors/plugins/TrackTFClassifier.cc
    print(preproc.variableNamesToClip)
    print("Upper cutoffs: ", np.round(preproc.upperThresholds.to_numpy(), 3).tolist())
    print("Lower cutoffs: ", np.round(preproc.lowerThresholds.to_numpy(), 3).tolist())

    classifier = createClassifier(len(QCDTrainPreprocessed.columns), means, scales)
    classifier.compile(optimizer=tf.keras.optimizers.Adam(lr=1e-3, amsgrad=True),
                       metrics=[tf.keras.metrics.AUC(name="auc")],
                       loss="mse")
    classifier.fit(QCDTrainPreprocessed.to_numpy(),
                   QCDTrain.loc[:, "trk_isTrue"],
                   sample_weight=weights,
                   epochs=50,
                   batch_size=1024,
                   validation_split=0.1)

    # Save the model in case a later need for additional plotting arises
    classifier.save('./model.h5')

    # Create a quick set of plots to get an idea of the performance
    # The true plots have to be done in CMSSW
    createClassifierPlots(classifier, preproc)

    # Model for deployment using the CMSSW TF C++ API
    createFrozenModel(classifier)
def add_article():
    """
    Checks whether the supplied article already exists in the list `articles`,
    and if not, appends the new document to the list.

    :return: Re-renders the main template for the NewsQuery page, updating the
        variable `article_added` with a String describing whether the article
        was successfully appended to the list `articles` or not.
    """
    new_article = preprocessor(request.form.get('article_to_add'))
    article_added = "Document already exists; article not added."
    if not _article_exists(new_article):
        articles.append(new_article)
        article_added = "New article added!"
    print(articles)
    return render_template('newsquery.html',
                           num_articles=len(articles),
                           article_added=article_added)
def main():
    # channel_name = variaveis[1]
    # url_channel = variaveis[2]
    url_channel = 'https://www.youtube.com/c/LionBBQ/videos'
    channel_name = 'lion_bbq'
    json_key = f'{json_key_path}{json_key_name}'

    df_raw = processor_youtube_crawler(url_channel, channel_name)
    df_preprocessed = preprocessor(df_raw, channel_name, 'comment')
    df_classified = classification_model(df_preprocessed, channel_name, 'comment_lematized')
    save_gbq(df_classified, channel_name, 'classified_data', json_key)
def inv_index():
    invr_index = {}
    doc_no = 0
    for root, dirs, files in os.walk(os.getcwd() + "/cranfieldDocs"):
        doc_no = len(files)
        for filename in files:
            fl = open(os.getcwd() + "/cranfieldDocs/" + filename, 'r')
            text = fl.read()  # Read entire file into a string
            substring = text[text.find("<TITLE>") + len("<TITLE>"): text.find("</TITLE>")] \
                + text[text.find("<TEXT>") + len("<TEXT>"): text.find("</TEXT>")]
            token = preprocessing.preprocessor(substring)
            fl.close()
            for a in token:
                if a in invr_index:
                    if int(filename[-4:]) in invr_index[a]:
                        invr_index[a][int(filename[-4:])] += 1
                    else:
                        invr_index[a][int(filename[-4:])] = 1
                else:
                    invr_index[a] = {int(filename[-4:]): 1}
    return invr_index, doc_no
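# Hedged usage sketch (assumption): build the inverted index over the
# cranfieldDocs collection, then inspect the document count and the postings
# for one illustrative term.
index, n_docs = inv_index()
print("documents indexed:", n_docs)
print("postings for 'flow':", index.get("flow", {}))  # 'flow' is just an example term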
import sys

import numpy as np

sys.path.insert(0, '/media/storage/Projects/ML_SuperPower/')
import FeedForwardNeuralNetwork as FFNN
import preprocessing as pp

##################################################
##################################################
####      Test Restore Predict Models        ####
##################################################
##################################################

SETNAME = "output_2010-2017.csv"

proc = pp.preprocessor()
proc.load(datafile="processedData/" + SETNAME)

NN = FFNN.FFNN()

procTemp = pp.preprocessor()
procTemp.data.append(proc.data[0].copy())
procTemp.drop(conditionString="df[\"Batter\"] != \"Jose Altuve\"")
procTemp.drop(["Batter", "Date_1", "Pitcher", "Weather", "Class"], axis=1)

dropCol = []
for col in procTemp.data[0].columns:
    if "Unnamed" in col:
        dropCol.append(col)
if len(dropCol) > 0:
    procTemp.drop(dropCol, axis=1)
import yaml
import os
import logging

from preprocessing.preprocessor import *
from knowledge_extraction.extractor import *
from graph_maker.graph_maker import *
from background_knowledge.background import *

with open(os.path.join('config', 'config.yaml'), 'r', encoding='utf8') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

LOG_FORMAT = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=getattr(logging, 'INFO'))
logger = logging.getLogger(__name__)
logger.info('Config: %s' % config)

subtitles_dir = config['preprocessing']['substitle_file']

preprocessor = preprocessor(config)
preprocessor.save_output()

extractor = extractor(config, preprocessor.output)
extractor.save_output()

back_KB = background_knowledge(config)

graph_maker = graph_maker(config, extractor.output, back_KB.output)
with open(archive_path + 'commandline_args.yaml', 'w') as f:
    yaml.dump(args.__dict__, f)

preproc, modelling, visualization = args.pipeline

if preproc:
    # Parsing
    df_pd = cftm_parser.parquet_transform(parquet_path1, parquet_path2,
                                          n=args.observation_n)

    # Pre-processing
    stopwords = list(STOP_WORDS)
    texts, dictionary, corpus = pp.preprocessor(df_pd, stopwords=stopwords,
                                                language='de', text='TEXT',
                                                metadata=args.agg_metadata,
                                                min_len=args.agg_length)
    training_data = {
        "texts": texts,
        "dictionary": dictionary,
        "corpus": corpus
    }
    pickle.dump(training_data, open(data_path, 'wb'))
    pickle.dump(training_data, open(archive_path + ntpath.split(data_path)[1], 'wb'))
elif modelling or visualization:
    try:
        training_data = pickle.load(open(data_path, 'rb'))
        texts, dictionary, corpus = training_data['texts'], \
            training_data['dictionary'], training_data['corpus']
##################################################
##################################################
####   Create Ensemble input by running      ####
####   test data through base models         ####
##################################################
##################################################

SETNAME = "output_2010-2018_test.csv"
TRAINNAME = "ENSEMBLE_INPUT.csv"

# Below used in testing values
testFileNamePit = "data/testFilePitch.csv"
testFileNameBat = "data/testFileBat.csv"

# Load test data into preprocessor
proc = pp.preprocessor()
proc.load(datafile="processedData/" + SETNAME)

# Load all Batters and Pitchers
with open("data/2018_Batters_All.txt", "r") as f:
    batterRead = f.read().splitlines()
with open("data/2018_Pitchers_All.txt", "r") as f:
    pitcherRead = f.read().splitlines()

modelAccuracyFile = open("models/ACCURACY.txt", "r")
modelAccuracyRead = modelAccuracyFile.readlines()
modelAccuracyFile.close()

modelAccuracies = {}
for accLine in modelAccuracyRead:
import preprocessing as preprocess
import modeling

if __name__ == "__main__":
    pre_proc = preprocess.preprocessor(from_file=True)
    X, y = pre_proc.preprocess()
    # preprocess.EDA().scatter_plot(X, y)
    modeling.tree_classifiers(X, y).test()
    modeling.logistic_classifier(X, y).test()
    # print(pre_proc)
    .appName("Pipeline_Naive_Bayes")\
    .config("spark.driver.maxResultSize", "3g")\
    .getOrCreate()
sc = spark.sparkContext

# load data
X_train_file = "./data/X_train_large.txt"
y_train_file = "./data/y_train_large.txt"
train_data = sc.textFile(X_train_file)
train_labels = sc.textFile(y_train_file)

X_test_file = "./data/X_test_large.txt"
test_data = sc.textFile(X_test_file)

# process data
preprocessor = preprocessor(bigrams=True, stemming=True, tfidf=True, min_df=3)
train = preprocessor.transform(train_data, train_labels)
test = preprocessor.transform(test_data, train=False)

# fit nb
nb = naive_bayes()
nb.fit(train, labelcol='CCAT')
test = nb.predict(test, outputcol='CCAT_nb')
nb.fit(train, labelcol='ECAT')
test = nb.predict(test, outputcol='ECAT_nb')
nb.fit(train, labelcol='GCAT')
test = nb.predict(test, outputcol='GCAT_nb')
nb.fit(train, labelcol='MCAT')
# import visualizeNet as vis
import os
import pandas as pd
import sys

sys.path.insert(0, '/media/storage/Projects/ML_SuperPower/')
import preprocessing as pp
import FeedForwardNeuralNetwork as FFNN
import time

# SETNAME = "output_2010-2017.csv"
SETNAME = "ENSEMBLE_INPUT.csv"

proc = pp.preprocessor()
proc.load(datafile="processedData/" + SETNAME)

dropCol = []
for col in proc.data[0].columns:
    if "Unnamed" in col:
        dropCol.append(col)
if len(dropCol) > 0:
    proc.drop(dropCol, axis=1)

proc.data[0] = proc.data[0].dropna(axis=0, how='any')
proc.drop(["Pitcher"], axis=1)
proc.shuffle()

'''
procTemp = pp.preprocessor()
procTemp.data.append(proc.data[0].copy())
split = round((len(proc.data[0])/10)*9)
from hAutomata import HFSA13, HFSA14, HFSA15, HFSA16, SimpleFSA

#### MAIN PROGRAM ####
infile = codecs.open(sys.argv[1], 'r', 'utf-8')
outfile = codecs.open(sys.argv[2], 'w', 'utf-8')
lines = infile.readlines()
infile.close()

# get a verse selector: use this to select and process a random subset of verses
# sel = selector()
# selection = sel.select(lines, 1)

# get a preprocessor
prep = preprocessor()

# hierarchical FSAs for syllable-wise spondeus search (à la Papakitsos 2011)
hfsa13 = HFSA13('hfsa13')
hfsa14 = HFSA14('hfsa14')
hfsa15 = HFSA15('hfsa15')
hfsa16 = HFSA16('hfsa16')

# simple FSA for vowel-wise analysis in case of errors
simple = SimpleFSA('simple')

for line in lines:
    # for line in selection:
    scansion = ''
    synizesis = False
    solutionLength = 0
    correctionLength = 0
                dc_len[keys] = dc_len[keys] + (Inv_Index[i][keys] *
                                               math.log10(Doc_no / len(Inv_Index[i])))**2
            else:
                dc_len[keys] = (Inv_Index[i][keys] *
                                math.log10(Doc_no / len(Inv_Index[i])))**2
    for d in dc_len:
        dc_len[d] = math.sqrt(dc_len[d])
    return dc_len


Doc_Len = doc_len()

# Creating a list of lists where each inner list represents a query
with open("queries.txt") as f:
    content = f.readlines()
query = [preprocessing.preprocessor(x.strip()) for x in content]

# Creating relevant pairs of (query id, document id) from relevance.txt
with open("relevance.txt") as f:
    content = f.readlines()
list1 = [x.strip().split(" ") for x in content]
relevant = [[int(x[0]), int(x[1])] for x in list1]


# Computing precision at the top k retrieved documents
def precision(relevant_l, rank_pair_l, k):
    num1 = 0
    for r in rank_pair_l[:k]:
        if r in relevant_l:
            num1 += 1
    pre = num1 / k
                    type=str,
                    help='input raw file with \\n sentence separation')
args = parser.parse_args()

if not args:
    parser.print_usage()
    sys.exit(1)

nlp = spacy.load(args.spacy_model)
stopWords = set(stopwords.words(args.nltk_stopwords))

raw_text = args.input_file
with open(raw_text, 'r', encoding='utf-8') as f:
    text = f.readlines()

clean_corpus = preprocessor(text, nlp, stopWords)

tfidfvect = TfidfVectorizer()
agglomerative = AgglomerativeClustering(n_clusters=args.n_clusters,
                                        affinity='euclidean',
                                        linkage='ward')

tfidfmatrix = tfidfvect.fit_transform(clean_corpus)
aggclusters = agglomerative.fit(tfidfmatrix.toarray())

print(pd.DataFrame(aggclusters.labels_, clean_corpus))
def process_url(current_url, url_queue, urls_crawled, total_words, page_graph):
    page = requests.get(current_url)
    page_urls = []
    a_tags = []
    try:
        if page.status_code == 200:  # status code 200 indicates page is retrieved successfully
            response = urlopen(current_url)
            soup = BeautifulSoup(response, 'lxml')
            a_tags = soup.find_all('a', href=True)
    except Exception as e:
        print('Failed to connect {} due to {}: '.format(current_url, e))
        return

    # extract all the links in a URL
    page_urls = []
    for tag in a_tags:
        href_link = tag.get('href')
        # if href_link is not None not any(ext in href_link for ext in except_extensions):
        if href_link.find('#'):
            href_link = href_link.split('#')
            href_link = href_link[0]
        if len(href_link) >= 1 and href_link[-1] != '/':
            href_link += '/'
        href_split = href_link.split('://')
        # checking for http and https
        if len(href_split) > 1 and href_split[0][:4] == 'http':
            if len(href_split[0]) > 4 and href_split[0][4] == 's':
                href_split[0] = 'http'
            if href_split[1][:4] == "www.":
                href_split[1] = href_split[1][4:]
            href_bits = href_split[1].split('/')
            if domain in href_bits[0]:
                page_urls.append(href_split[0] + '://' + href_split[1])
        if len(href_split) == 1:
            if len(href_split[0]) > 1 and href_split[0][0] == '/':
                page_urls.append(current_url + href_split[0][1:])

    # update the queue and add an edge from current URL to all URLs connected to it
    g.add_node(current_url, page_graph)  # add node in the undirected graph for the current url
    for c_url in page_urls:
        if c_url not in urls_crawled:
            urls_crawled.add(c_url)  # add current url to the set of crawled urls
            url_queue.append(c_url)  # add current url to the deque
        g.add_edge(current_url, c_url, page_graph)

    soup = BeautifulSoup(page.text, 'html.parser')  # parser to parse url content
    content = soup.find_all(text=True)
    clean_text = eliminate_tags(content)  # to eliminate tags from the text

    for text in clean_text:
        text = text.strip()  # to remove extra spaces
        words = tokenizer(text)  # function to tokenize text
        for word in words:
            stem_word = preprocessor(word)  # function to preprocess words
            # calculating inverted index for each word
            if stem_word not in inverted_index:
                inverted_index[stem_word] = {}
                inverted_index[stem_word][current_url] = 1
                total_words[stem_word] = 1
            else:
                if current_url in inverted_index[stem_word]:
                    inverted_index[stem_word][current_url] += 1
                else:
                    inverted_index[stem_word][current_url] = 1
                total_words[stem_word] += 1