Example #1
def flair_predictor(input_url):
    submission = reddit.submission(url=input_url)
    rows = [[submission.title, submission.link_flair_text]]
    df = pd.DataFrame(rows, columns=['title', 'flair'])
    df['title'] = preprocessor(df['title'])
    text = df['title']
    topic = df['flair']

    input_tf = pickled_CV_model.transform(text)
    input_tfidf = pickled_TF_model.transform(input_tf)

    output_SVM = pickled_SVM_model.predict(input_tfidf)
    output_NB = pickled_NB_model.predict(input_tfidf)
    output_RFC = pickled_RFC_model.predict(input_tf)

    score_svm = np.max(pickled_SVM_model.decision_function(input_tfidf))
    score_nb = np.max(pickled_NB_model.predict_proba(input_tfidf))
    score_rfc = np.max(pickled_RFC_model.predict_proba(input_tf))

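    # Report the prediction from whichever model is most confident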
    if score_nb > score_svm and score_nb > score_rfc:
        print("predicted flair is ", output_NB)
        output = output_NB
    elif score_svm > score_rfc:
        print("predicted flair is ", output_SVM)
        output = output_SVM
    else:
        print("predicted flair is ", output_RFC)
        output = output_RFC
    return output
Example #2
def demo(args,noise=0,test_char=42):
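    # Interactive demo: pick a character and a noise range (from the GUI args or stdin), then run the
    # preprocessing, test, post-processing and verification stages, appending each stage's status to exe_stat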

    # Input the number of the character
    if args.gui:
        args.test_char = test_char
    else:
        test_char = 0
        while test_char < 1 or test_char > args.char_max:
            test_char = int(input(f"Input the number of character (1-{args.char_max}):"))
            if test_char < 1 or test_char > args.char_max:
                print("Please input the number in the correct range!")
            else:
                args.test_char = test_char
                break
    
    # Input the upper limit of the noise range
    if args.gui:
        args.noise = [-1 * noise, noise]
    else:
        noise = -1
        while noise < 0 or noise > args.noise_max:
            noise = float(input(f"Input the upper limit of the noise range (0-{args.noise_max}):"))
            if noise < 0 or noise > args.noise_max:
                print("Please input the number in the correct range!")
            else:
                args.noise = [-1 * noise, noise]
                break

    if os.path.exists(args.test_path):
        shutil.rmtree(args.test_path)
    if os.path.exists(args.save_path):
        shutil.rmtree(args.save_path)

    exe_stat.append(
        preprocessor(args)
    )

    print('\n===================================================')
    exe_stat.append(
        demo_test(args)
    )

    print('\n===================================================')
    exe_stat.append(
        postprocessor(args)
    )

    print('\n===================================================')
    exe_stat.append(
        verification(args)
    )

    if args.usb_path is not None:
        print('\n===================================================')
        exe_stat.append(
            demo_post(args)
        )

    print('\n===================================================')
    print(f'Testing number {args.test_char} with noise {args.noise}, Done!!!')
Example #3
def main():

    dataset = pd.read_csv("./data/train.csv")

    # Preprocess the data.
    X_train, Y_train = preprocessor(dataset,
                                    fill_age_with='advanced_median',
                                    dropPassengerID=True,
                                    dropName=True)
    print(X_train)
    print(Y_train)
Example #4
def preprocess_query(query):
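    # Build a term-frequency dictionary over the preprocessed query words (tokens reduced to "a" are skipped)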
    query_df = {}
    query = query.lower()
    query_words = query.split(' ')

    for qword in query_words:
        text = preprocessor(qword)
        if (text == "a"):
            continue
        if text not in query_df:
            query_df[text] = 1
        else:
            query_df[text] += 1
    return query_df
Example #5
def main():
    print("Begin program")

    createFolders(["plots"])

    path = "~/QCD_Flat_15_7000_correct/"
    QCDTrain = getSamples([
        path + "trackingNtuple.root", path + "trackingNtuple2.root",
        path + "trackingNtuple3.root", path + "trackingNtuple4.root"
    ])
    weights = domainAdaptationWeights(QCDTrain, "datasets/T5qqqqWW.root")
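    # Fit the preprocessor on the training input variables; 0.05 and 0.95 presumably set the lower/upper clipping quantiles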
    preproc = preprocessor(0.05, 0.95)
    preproc.fit(QCDTrain.loc[:, inputVariables + ["trk_algo"]])

    QCDTrainPreprocessed = preproc.process(QCDTrain.loc[:, inputVariables +
                                                        ["trk_algo"]])

    means, scales = preproc.getMeansAndScales()

    #The outputs of these printouts are to be used as the cutoff values when evaluating the
    #deployed model in CMSSW. See RecoTracker/FinalTrackSelectors/plugins/TrackTFClassifier.cc
    print(preproc.variableNamesToClip)
    print("Upper cutoffs: ",
          np.round(preproc.upperThresholds.to_numpy(), 3).tolist())
    print("Lower cutoffs: ",
          np.round(preproc.lowerThresholds.to_numpy(), 3).tolist())

    classifier = createClassifier(len(QCDTrainPreprocessed.columns), means,
                                  scales)
    classifier.compile(optimizer=tf.keras.optimizers.Adam(lr=1e-3,
                                                          amsgrad=True),
                       metrics=[tf.keras.metrics.AUC(name="auc")],
                       loss="mse")
    classifier.fit(QCDTrainPreprocessed.to_numpy(),
                   QCDTrain.loc[:, "trk_isTrue"],
                   sample_weight=weights,
                   epochs=50,
                   batch_size=1024,
                   validation_split=0.1)

    #Save the model in case additional plotting is needed later
    classifier.save('./model.h5')

    #Create quick set of plots to get an idea of the performance
    #The true plots have to be done in CMSSW
    createClassifierPlots(classifier, preproc)

    #Model for deployment using CMSSW TF C++API
    createFrozenModel(classifier)
Example #6
def add_article():
    """
    This method asserts whether the supplied article to be added exists in the list `articles`, and if not, appends the
    new document to the list.
        :return: Re-renders the main template for the NewsQuery page, updating the variable `article_added` with a
                 String result if the article to be added was successfully appended to the list `articles`, or not.
    """
    new_article = preprocessor(request.form.get('article_to_add'))
    article_added = "Document already exists; article not added."
    if not _article_exists(new_article):
        articles.append(new_article)
        article_added = "New article added!"
    print(articles)
    return render_template('newsquery.html',
                           num_articles=len(articles),
                           article_added=article_added)
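
The helper `_article_exists` is not shown in this excerpt. As a minimal sketch, assuming the module-level list `articles` holds the preprocessed documents and that equality is a sensible duplicate check, it could look like this (hypothetical helper, not necessarily the project's actual implementation):

def _article_exists(candidate):
    # Hypothetical duplicate check: True if an equivalent preprocessed article is already stored
    return any(candidate == existing for existing in articles)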
Example #7
def main():
    # channel_name = variaveis[1]
    # url_channel = variaveis[2]

    url_channel = 'https://www.youtube.com/c/LionBBQ/videos'
    channel_name = 'lion_bbq'

    json_key = f'{json_key_path}{json_key_name}'

    df_raw = processor_youtube_crawler(url_channel, channel_name)

    df_preprocessed = preprocessor(df_raw, channel_name, 'comment')

    df_classified = classification_model(df_preprocessed, channel_name,
                                         'comment_lematized')

    save_gbq(df_classified, channel_name, 'classified_data', json_key)
Example #8
def inv_index():
    invr_index = {}
    doc_no = 0
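    # Inverted index structure: token -> {document id (numeric suffix of the filename): term frequency}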
    for root, dirs, files in os.walk(os.getcwd() + "/cranfieldDocs"):
        doc_no = len(files)
        for filename in files:
            with open(os.getcwd() + "/cranfieldDocs/" + filename, 'r') as fl:
                text = fl.read()  # Read entire file into a string
            substring = text[text.find("<TITLE>") + len("<TITLE>"): text.find("</TITLE>")] \
                        + text[text.find("<TEXT>") + len("<TEXT>"): text.find("</TEXT>")]
            token = preprocessing.preprocessor(substring)
            for a in token:
                if a in invr_index:
                    if int(filename[-4:]) in invr_index[a]:
                        invr_index[a][int(filename[-4:])] += 1
                    else:
                        invr_index[a][int(filename[-4:])] = 1
                else:
                    invr_index[a] = {int(filename[-4:]): 1}
    return invr_index, doc_no
Example #9
import sys
import numpy as np
sys.path.insert(0, '/media/storage/Projects/ML_SuperPower/')
import FeedForwardNeuralNetwork as FFNN
import preprocessing as pp

##################################################
##################################################
####      Test Restore Predict Models         ####
##################################################
##################################################
SETNAME = "output_2010-2017.csv"
proc = pp.preprocessor()
proc.load(datafile="processedData/" + SETNAME)

NN = FFNN.FFNN()

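# Copy the loaded data into a second preprocessor and keep only the rows for batter "Jose Altuve"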
procTemp = pp.preprocessor()
procTemp.data.append(proc.data[0].copy())
procTemp.drop(conditionString="df[\"Batter\"] != \"Jose Altuve\"")

procTemp.drop(["Batter", "Date_1", "Pitcher", "Weather", "Class"], axis=1)

dropCol = []
for col in procTemp.data[0].columns:
    if "Unnamed" in col:
        dropCol.append(col)

if dropCol:
    procTemp.drop(dropCol, axis=1)
Example #10
import yaml
import os
import logging
from preprocessing.preprocessor import *
from knowledge_extraction.extractor import *
from graph_maker.graph_maker import *
from background_knowledge.background import *

with open(os.path.join('config', 'config.yaml'), 'r', encoding='utf8') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

LOG_FORMAT = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

logger.info('Config: %s', config)

subtitles_dir = config['preprocessing']['substitle_file']

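# Run the pipeline stages in order: preprocessing, knowledge extraction, background knowledge, then graph construction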
preprocessor = preprocessor(config)
preprocessor.save_output()

extractor = extractor(config, preprocessor.output)
extractor.save_output()

back_KB = background_knowledge(config)

graph_maker = graph_maker(config, extractor.output, back_KB.output)

Example #11
    with open(archive_path + 'commandline_args.yaml', 'w') as f:
        yaml.dump(args.__dict__, f)

    preproc, modelling, visualization = args.pipeline
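    # Pipeline stage flags: either parse and pre-process the raw data, or load the cached training data for modelling/visualization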
    if preproc:
        # Parsing
        df_pd = cftm_parser.parquet_transform(parquet_path1,
                                              parquet_path2,
                                              n=args.observation_n)

        # Pre-processing
        stopwords = list(STOP_WORDS)
        texts, dictionary, corpus = pp.preprocessor(df_pd,
                                                    stopwords=stopwords,
                                                    language='de',
                                                    text='TEXT',
                                                    metadata=args.agg_metadata,
                                                    min_len=args.agg_length)
        training_data = {
            "texts": texts,
            "dictionary": dictionary,
            "corpus": corpus
        }
        pickle.dump(training_data, open(data_path, 'wb'))
        pickle.dump(training_data,
                    open(archive_path + ntpath.split(data_path)[1], 'wb'))
    elif modelling or visualization:
        try:
            training_data = pickle.load(open(data_path, 'rb'))
            texts = training_data['texts']
            dictionary = training_data['dictionary']
            corpus = training_data['corpus']
Example #12
##################################################
##################################################
####     Create Ensemble input by running     ####
####      test data through base models       ####
##################################################
##################################################

SETNAME = "output_2010-2018_test.csv"
TRAINNAME = "ENSEMBLE_INPUT.csv"

#Below used in testing values
testFileNamePit = "data/testFilePitch.csv"
testFileNameBat = "data/testFileBat.csv"

#Load test data into preprocessor
proc = pp.preprocessor()
proc.load(datafile="processedData/" + SETNAME)

#Load all Batters and Pitchers
with open("data/2018_Batters_All.txt", "r") as f:
    batterRead = f.read().splitlines()

with open("data/2018_Pitchers_All.txt", "r") as f:
    pitcherRead = f.read().splitlines()

with open("models/ACCURACY.txt", "r") as modelAccuracyFile:
    modelAccuracyRead = modelAccuracyFile.readlines()

modelAccuracies = {}
for accLine in modelAccuracyRead:
Example #13
import preprocessing as preprocess
import modeling

if __name__ == "__main__":
    pre_proc = preprocess.preprocessor(from_file=True)
    X, y = pre_proc.preprocess()

    # preprocess.EDA().scatter_plot(X , y)
    modeling.tree_classifiers(X, y).test()
    modeling.logistic_classifier(X, y).test()
    # print(pre_proc)
Example #14
spark = SparkSession.builder\
        .appName("Pipeline_Naive_Bayes")\
        .config("spark.driver.maxResultSize", "3g")\
        .getOrCreate()
sc = spark.sparkContext

#load data
X_train_file = "./data/X_train_large.txt"
y_train_file = "./data/y_train_large.txt"
train_data = sc.textFile(X_train_file)
train_labels = sc.textFile(y_train_file)

X_test_file = "./data/X_test_large.txt"
test_data = sc.textFile(X_test_file)

#process data
preprocessor = preprocessor(bigrams=True,stemming=True,tfidf=True,min_df=3)
train = preprocessor.transform(train_data,train_labels)
test = preprocessor.transform(test_data,train=False)

#fit nb
nb = naive_bayes()
nb.fit(train,labelcol='CCAT')
test = nb.predict(test,outputcol='CCAT_nb')

nb.fit(train,labelcol='ECAT')
test = nb.predict(test,outputcol='ECAT_nb')

nb.fit(train,labelcol='GCAT')
test = nb.predict(test,outputcol='GCAT_nb')

nb.fit(train,labelcol='MCAT')
Example #15
#import visualizeNet as vis
import os
import pandas as pd
import sys

sys.path.insert(0, '/media/storage/Projects/ML_SuperPower/')
import preprocessing as pp
import FeedForwardNeuralNetwork as FFNN
import time

#SETNAME = "output_2010-2017.csv"
SETNAME = "ENSEMBLE_INPUT.csv"
proc = pp.preprocessor()
proc.load(datafile="processedData/" + SETNAME)

dropCol = []
for col in proc.data[0].columns:
    if "Unnamed" in col:
        dropCol.append(col)

if dropCol:
    proc.drop(dropCol, axis=1)

proc.data[0] = proc.data[0].dropna(axis=0, how='any')
proc.drop(["Pitcher"], axis=1)
proc.shuffle()
'''
procTemp = pp.preprocessor()
procTemp.data.append(proc.data[0].copy())

split = round((len(proc.data[0])/10)*9)
Example #16
import codecs
import sys

from hAutomata import HFSA13, HFSA14, HFSA15, HFSA16, SimpleFSA

####MAIN PROGRAM####

infile = codecs.open(sys.argv[1], 'r', 'utf-8')
outfile = codecs.open(sys.argv[2], 'w', 'utf-8')

lines = infile.readlines()
infile.close()

#get a verse selector: use this to select and process a random subset of verses
#sel = selector()
#selection = sel.select(lines, 1)

#get a preprocessor
prep = preprocessor()

#hierarchical FSAs for syllable-wise spondeus search (à la Papakitsos 2011)
hfsa13 = HFSA13('hfsa13')
hfsa14 = HFSA14('hfsa14')
hfsa15 = HFSA15('hfsa15')
hfsa16 = HFSA16('hfsa16')
#simple FSA for vowel-wise analysis in case of errors
simple = SimpleFSA('simple')

for line in lines:
    #for line in selection:
    scansion = ''
    synizesis = False
    solutionLength = 0
    correctionLength = 0
Example #17
                dc_len[keys] = dc_len[keys] + (Inv_Index[i][keys] * math.log10(
                    Doc_no / len(Inv_Index[i])))**2
            else:
                dc_len[keys] = (Inv_Index[i][keys] *
                                math.log10(Doc_no / len(Inv_Index[i])))**2
    for d in dc_len:
        dc_len[d] = math.sqrt(dc_len[d])
    return dc_len


Doc_Len = doc_len()

# Creating a list of lists where each inner list represents a query
with open("queries.txt") as f:
    content = f.readlines()
query = [preprocessing.preprocessor(x.strip()) for x in content]

# Creating relevant pairs of (query id, document id) from relevance.txt
with open("relevance.txt") as f:
    content = f.readlines()
list1 = [x.strip().split(" ") for x in content]
relevant = [[int(x[0]), int(x[1])] for x in list1]


# Computing precision at top k retrieved documents
def precision(relevant_l, rank_pair_l, k):
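    # precision@k: fraction of the top-k ranked (query id, document id) pairs that appear in the relevant list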
    num1 = 0
    for r in rank_pair_l[:k]:
        if r in relevant_l:
            num1 += 1
    pre = num1 / k
Example #18
                    type=str,
                    help='input raw file with \\n sentence separation')

args = parser.parse_args()

if not args.input_file:
    parser.print_usage()
    sys.exit(1)

nlp = spacy.load(args.spacy_model)

stopWords = set(stopwords.words(args.nltk_stopwords))

raw_text = args.input_file

with open(raw_text, 'r', encoding='utf-8') as f:
    text = f.readlines()

clean_corpus = preprocessor(text, nlp, stopWords)

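# Vectorize the cleaned corpus with TF-IDF and group the documents with Ward agglomerative clustering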
tfidfvect = TfidfVectorizer()
agglomerative = AgglomerativeClustering(n_clusters=args.n_clusters,
                                        affinity='euclidean',
                                        linkage='ward')

tfidfmatrix = tfidfvect.fit_transform(clean_corpus)

aggclusters = agglomerative.fit(tfidfmatrix.toarray())

print(pd.DataFrame(aggclusters.labels_, clean_corpus))
Example #19
def process_url(current_url, url_queue, urls_crawled, total_words, page_graph):
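    # Crawl a single page: collect same-domain links into the queue and page graph, then update the inverted index with the page's words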
    page = requests.get(current_url)
    page_urls = []
    a_tags = []

    try:
        if page.status_code == 200:  # status code 200 indicates page is retrieved successfully
            response = urlopen(current_url)
            soup = BeautifulSoup(response, 'lxml')
            a_tags = soup.find_all('a', href=True)

    except Exception as e:
        print('Failed to connect {} due to {}: '.format(current_url, e))
        return

    # extract all the links in a URL
    page_urls = []
    for tag in a_tags:
        href_link = tag.get('href')

        # if href_link is not None not any(ext in href_link for ext in except_extensions):
        if '#' in href_link:
            href_link = href_link.split('#')
            href_link = href_link[0]

        if len(href_link) >= 1 and href_link[-1] != '/':
            href_link += '/'

        href_split = href_link.split('://')

        # checking for http and https
        if len(href_split) > 1 and href_split[0][:4] == 'http':
            if len(href_split[0]) > 4 and href_split[0][4] == 's':
                href_split[0] = 'http'
            if href_split[1][:4] == "www.":
                href_split[1] = href_split[1][4:]
            href_bits = href_split[1].split('/')

            if domain in href_bits[0]:
                page_urls.append(href_split[0] + '://' + href_split[1])

        if len(href_split) == 1:
            if len(href_split[0]) > 1 and href_split[0][0] == '/':
                page_urls.append(current_url + href_split[0][1:])

    # update the queue and add an edge from current URL to all URLs connected to it
    g.add_node(
        current_url,
        page_graph)  # add node in the undirected graph for the current url

    for c_url in page_urls:
        if c_url not in urls_crawled:
            urls_crawled.add(
                c_url)  # add current url to the set of crawled urls
            url_queue.append(c_url)  # add current url to the deque
        g.add_edge(current_url, c_url, page_graph)

    soup = BeautifulSoup(page.text,
                         'html.parser')  # parser to parse url content
    content = soup.find_all(text=True)
    clean_text = eliminate_tags(content)  # to eliminate tags from the text

    for text in clean_text:
        text = text.strip()  # to remove extra spaces
        words = tokenizer(text)  # function to tokenize text

        for word in words:
            stem_word = preprocessor(word)  # function to preprocess words
            # calculating inverted index for each word
            if stem_word not in inverted_index:
                inverted_index[stem_word] = {}
                inverted_index[stem_word][current_url] = 1
                total_words[stem_word] = 1
            else:
                if current_url in inverted_index[stem_word]:
                    inverted_index[stem_word][current_url] += 1
                else:
                    inverted_index[stem_word][current_url] = 1
                total_words[stem_word] += 1