Example #1
def preProcessTweets():
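    # Read tweets from the CSV export, clean each one with the Preprocessor,
    # and collect every token that survives the stop-word filter below.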
    processedTweets = []
    with open("E:/twitter10k.csv", newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')

        for row in reader:
            try:
                #print(counter)
                #print(row)
                tweet = row[-2]
                #print(tweet)
                #print('############')
                preProcessor = preprocessor.Preprocessor()
                processedTweet = preProcessor.preprocess_text(tweet)
                #print(processedTweet)
                # Skip a handful of noisy tokens before collecting the word.
                stop_tokens = {'http', 'get', 'is', 'ny', 'lol',
                               'na', 'u', '-', 'us', 'im'}
                for word in processedTweet:
                    if word not in stop_tokens:
                        processedTweets.append(word)
                #print('done processing ############')
            except Exception:
                # Skip rows that fail to parse or preprocess.
                continue

        return processedTweets
Example #2
    def __init__(self,
                 image_file_paths,
                 mean_image=None,
                 bandstats_file_path=None,
                 original_size_x=650,
                 original_size_y=650,
                 input_size=256,
                 slice_count_x=1,
                 slice_count_y=1,
                 is_8_channel=True):
        super().__init__()
        # logger.info("Creating image list dataset from {} images".format(str(len(image_file_paths))))
        self.preprocessor = preprocessor.Preprocessor(
            datapath=None,
            original_size_x=original_size_x,
            original_size_y=original_size_y,
            input_size=input_size,
            slice_count_x=slice_count_x,
            slice_count_y=slice_count_y,
            is_8_channel=is_8_channel)
        # Point the preprocessor at the bandstats file explicitly; it will not find it on its own.
        self.preprocessor.path_mgr.bandstats_file = bandstats_file_path
        self.image_file_paths = image_file_paths
        self.slice_count = slice_count_x * slice_count_y
        self.current_image_path = ""
        self.is_8_channel = is_8_channel
        self.preloaded_slices = {}

        # TODO(martun): later implement mean subtraction as a transformation.
        self.mean_image = mean_image
Example #3
def voodooOneFile(fullName, inputPath, fileList):
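    # Generate the voodoo mock code for one file; if generation fails, fall back
    # to the Preprocessor's interceptor output and embed the error for inspection.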
    fullOutput = fullOutputName(fullName, inputPath)
    mkdirOf(fullOutput)
    output = ''
    try:
        output += voodoo.voodoo(input=fullName,
                                output=fullOutput,
                                pathToRemoveFromIdentifier=inputPath,
                                voodooDBFile=args.voodooDB,
                                includes=args.includePath,
                                defines=args.define,
                                trace=False,
                                preIncludes=args.preInclude)
        state = "V"
    except Exception as e:
        if str(e).find("all argume") != -1:
            raise
        inputLines = voodoo._readLinesOfFile(fullName)
        prepro = preprocessor.Preprocessor(fullName, fullOutput, inputLines,
                                           inputPath)
        output += prepro.intercepter()
        output += "\n/* The error that forced interception:\n" + \
                    str( e ).replace( "*/", "* /" ) + "\n"
        output += "\n"
        output += "Voodoo stack trace:\n" + traceback.format_exc()
        output += "*/\n"
        output += "\n"
        state = "I"
Example #4
def main():
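    # Preprocess each corpus (UDC, CMDC, Wiki, BDC) and build a lexicon from the combined CSVs.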
    pp = preprocessor.Preprocessor()
    csvs = []
    csvs.extend(pp.do_udc())
    csvs.extend(pp.do_cmdc())
    csvs.extend(pp.do_wiki())
    csvs.extend(pp.do_bdc())
    lexicon.Lexicon(csvs)
Example #5
def preprocessor(argv):
    mp3_path = argv[2]
    book_path = argv[3]
    print("You run preprocessor.")
    print("Path to mp3: " + mp3_path)
    print("Path to book: " + book_path)
    preprocessor = pr.Preprocessor(mp3_path, book_path, PREPROCESSOR_PATH)
    preprocessor.preprocess()
Example #6
 def set_preprocessor(self, preprocessor_):
     if preprocessor_ is None:
         preprocessor_ = [preprocessor.Preprocessor()]
     elif not isinstance(preprocessor_, list):
         preprocessor_ = [preprocessor_]
     self.preprocessors = preprocessor_
     Xtrain, ytrain = self.split(self.training_data, self.target_column)
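     # Fit each preprocessor on the training split in turn, transforming the data before the next one is fitted.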
     for pp in self.preprocessors:
         pp.fit(Xtrain)
         Xtrain = self._preprocess_one(Xtrain, pp)
Example #7
 def __init__(self, nominal_src, nominal_file, conditions_ls):
     self.src = nominal_src
     self.input_name = nominal_file
     self.nominal_model = preprocessor.Preprocessor(nominal_src +
                                                    nominal_file)
     self.nominal_model.clean_input()
     self.nominal_model.generate_species_classes()
     self.Temp_ls = [conditions_ls[0]]  # should be a list
     self.Pres_ls = [conditions_ls[1]]  # should be a list
     self.Energy_grid = conditions_ls[2]  # should be a float
     self.new_ne_file = []
Example #8
    def setUp(self):
        # initial runtime environment
        args = {"config_file": "../config_omniphotos.yaml"}
        self.preprocessor = preprocessor.Preprocessor(args)

        self.preprocessor.root_dir = \
            pathlib.Path("D:/workdata/testDatasets/circular/KyotoShrines_test")
        self.preprocessor.image_output_path = \
            pathlib.Path("D:/workdata/testDatasets/circular/KyotoShrines_test/Input")
        self.preprocessor.FPS = 50
        self.preprocessor.omniphotos_config_template_path = \
            "D:/workspace/Python/preprocessing/template/config.yaml.template"
Example #9
def main():
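    # Note: the parsed CLI arguments are overridden below by hard-coded paths
    # for the ZXing, Rhino and JodaTime datasets.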
    parser = argparse.ArgumentParser()
    parser.add_argument("--project_dir", type=str, required=True)
    parser.add_argument("--report_dir", type=str, required=True)
    parser.add_argument("--num_files_to_print",
                        type=int,
                        required=False,
                        default=20)
    args = parser.parse_args()

    project_dir = args.project_dir
    report_dir = args.report_dir
    num_files_to_print = args.num_files_to_print

    project_dir = "../data/ZXing"
    report_dir = "../data/ZXing/ZXingBugRepository.xml"

    project_report_info = preprocessor.Preprocessor(project_dir, report_dir)
    similarity_info = similarity_calculator.SimilarityCalculator(
        project_report_info.project_frequency_dict,
        project_report_info.report_frequency_dict,
        project_report_info.xml_report, num_files_to_print)

    project_dir = "../data/Rhino"
    report_dir = "../data/Rhino/RhinoBugRepository.xml"

    project_report_info = preprocessor.Preprocessor(project_dir, report_dir)
    similarity_info = similarity_calculator.SimilarityCalculator(
        project_report_info.project_frequency_dict,
        project_report_info.report_frequency_dict,
        project_report_info.xml_report, num_files_to_print)

    project_dir = "../data/JodaTime/"
    report_dir = "../data/JodaTime/JodaTimeBugRepository.xml"

    project_report_info = preprocessor.Preprocessor(project_dir, report_dir)
    similarity_info = similarity_calculator.SimilarityCalculator(
        project_report_info.project_frequency_dict,
        project_report_info.report_frequency_dict,
        project_report_info.xml_report, num_files_to_print)
Example #10
def preprocess_input(document,
                     lower=True,
                     remove_punctuation=False,
                     remove_stop_words=False):
    preprocessor = pp.Preprocessor()
    if lower:
        document = document.lower()
    if remove_punctuation:
        document = preprocessor.remove_punctuation(document)
    if remove_stop_words:
        document = preprocessor.remove_stop_words(document,
                                                  german=True,
                                                  english=True)
    return document
Example #11
    def make_dataframe(self):
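        # Load the test CSV, run it through the tokenize/tag/lemmatize pipeline,
        # and attach the model's prediction scores to the resulting dataframe.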
        test_df = pd.read_csv(self.test_path, names=['x', 'y'])

        test_data = preprocessor.Preprocessor(test_df, self.model.vocab)
        test_data.tokenize('x')
        test_data.add_tags('x')
        test_data.lemmatize('x')

        test_data.update_dataframe('x', 'y')

        test_data.data['pos_score'], test_data.data[
            'neg_score'], test_data.data['likelihood_pos'] = (
                self.model.predict(test_data.data))
        return test_data.data
Example #12
 def __init__(self,
              input_path,
              nominal_file,
              perturb_dict,
              nominal_dict,
              abstraction=False):
     self.input_path = input_path
     self.input_name = nominal_file
     self.nominal_model = preprocessor.Preprocessor(input_path +
                                                    nominal_file)
     self.nominal_model.clean_input()
     self.nominal_model.generate_species_classes(abstraction=abstraction)
     self.perturb_dict = perturb_dict
     self.nominal_dict = nominal_dict
     self.abstraction = abstraction
Example #13
def main():
    prep = preprocessor.Preprocessor()
    # BUG:
    raw_img_data = prep.read_img_jpeg_bytes("./pedestrain.jpg")
    warm_up(prep)

    server_addr = "/tmp/coin_dl_server"
    client_addr = "/tmp/coin_dl_client"
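    # Unix domain socket paths for the server and client; any stale socket files
    # from a previous run are removed before the main loop starts.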

    for addr in [client_addr, server_addr]:
        try:
            os.remove(addr)
        except OSError:
            pass

    main_loop(server_addr, client_addr, prep, raw_img_data)
Example #14
 def warm_up(self, det, mode):
     self.logger.info("Warm-up the detector")
     start = time.time()
     raw_img_data = det.read_img_jpeg_bytes("./pedestrain.jpg")
     if mode == "raw":
         # Warm up the session, first time inference is slow
         ret = det.inference(raw_img_data)
         ret = det.get_detection_results(*ret)
     elif mode == "preprocessed":
         prep = preprocessor.Preprocessor()
         compressed_img_data = prep.inference(raw_img_data, 70)
         ret = det.inference(compressed_img_data)
         ret = det.get_detection_results(*ret)
     duration = time.time() - start
     self.logger.info(
         f"Warm-up the mode {mode} finished! Takes {duration} seconds")
Example #15
def generate_files():
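    # End-to-end dataset preparation: load the dataset, preprocess synopses and
    # genres, build indexes, optionally compute embedding weights, then filter,
    # encode and save the data.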
    check_nltk_resources()
    check_paths()

    p = preprocessor.Preprocessor()
    df = p.load_dataset()

    p.preprocess_synopses(df)
    p.preprocess_genres(df)
    p.build_indexes()
    if settings.USE_W2V:
        p.generate_embedding_weights()
    p.filter_dataset()
    p.encode_genres()
    p.encode_synopses()
    p.save_data()
Example #16
def get_predictions(g, n):
    possible_genres = list(g.mlb.classes_)
    print("Possible film genres: ", ','.join(possible_genres))
    input_line = 'r'  #input("Insert a comma separated set of genres (r for random, q for quit): ")
    if input_line == 'q':
        exit()
    randomly = input_line == 'r'
    p = preprocessor.Preprocessor()
    if randomly:
        n_genres = random.randint(1, 6)
        input_genres = random.sample(possible_genres, n_genres)
    else:
        input_genres = input_line.split(',')
        for ig in input_genres:
            if ig not in possible_genres:
                print(ig + " is not a possible genre")
                get_predictions(g, n)
    print("Input genres: ", ', '.join(input_genres))
    encoded_genres = g.mlb.transform([input_genres])
    mode = input("Input g or b for greedy or beam search mode: ")
    previous_words = input("Introduce help/previous words (optional): ")
    previous_words = p.clean_text(previous_words)
    previous_words = p.tokenize(previous_words)[:-1]
    prvs = []
    for pw in previous_words:
        if pw in g.word_to_index.keys():
            prvs.append(pw)
        else:
            prvs.append(settings.UNKNOWN_TOKEN)
    # Use the unknown-token-mapped words; None when no help words were given.
    previous_words = prvs if prvs else None
    print("Starting words: " + str(previous_words))
    if mode == 'g':
        print("Greedy search mode")
        syn = get_predictions_greedy(g, n, encoded_genres, previous_words)
    elif mode == 'b':
        print("Beam search mode")
        syn = get_predictions_beam(g=g,
                                   n=n,
                                   encoded_genres=encoded_genres,
                                   previous_words=previous_words)
    else:
        print("Wrong mode")
        get_predictions(g, n)
    print("Synopsis: ", syn)
    get_predictions(g, n)
Example #17
class FramePipeline:
    cam = cm.Camera()
    preprocessor = prep.Preprocessor()
    homographyOp = homo.Homography()
    laneLinesFinder = None
    currOriginalFrame = None
    visualizer = None

    def __init__(self, frameWidth, frameHeight):
        self.frameWidth = frameWidth
        self.frameHeight = frameHeight
        self.cam.init(9, 6, 'camera_cal/calibration*.jpg')
        self.cam.calibrate()
        self.homographyOp.setFrameSize(frameWidth, frameHeight)
        self.homographyOp.estimateRoadHomography()
        self.laneLinesFinder = lf.LaneLinesFinder(frameWidth, frameHeight)
        self.visualizer = visu.Visualizer(self.laneLinesFinder, self)

    def processFrame(self, InputImg):
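        # Lane-detection pipeline: undistort, extract edges, crop, warp to a
        # top-down view, fit the lane lines, then draw the visualisation overlay.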

        self.currOriginalFrame = InputImg

        undistortedImg = self.cam.undistortImg(InputImg)

        sobelImg = self.preprocessor.extractEdges(undistortedImg, 'all')

        croppedImg = self.preprocessor.crop(sobelImg)

        rectImg = self.homographyOp.warp(croppedImg)

        warped_out = self.laneLinesFinder.findLane(rectImg)

        output = self.visualizer.visualizeFrame(rectImg)

        #only for the report at the end
        #cv2.imwrite('afterUndist.jpg', undistortedImg)
        #cv2.imwrite('afterSobel.jpg', sobelImg)
        #cv2.imwrite('afterCropping.jpg', croppedImg)
        #cv2.imwrite('afterRectifying.jpg', rectImg)
        #cv2.imwrite('afterFitting.jpg', warped_out)
        #cv2.imwrite('afterWarpingBack.jpg', output)
        #cv2.waitKey()
        return output
Example #18
    def __init__(self, methodName):
        super().__init__(methodName)

        self.preprocessor = preprocessor.Preprocessor()
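        # A tiny labelled dataset with messy text, plus the expected normalized
        # output the preprocessor should produce.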
        column_names = ['label', 'text']
        data_to_process = {
            'label': ['ham', 'ham', 'spam'],
            'text': [
                'Not normalized   #$text', '   AnOthEr not normalized TEXT',
                'Not normalized     SPAM'
            ],
        }
        self.dataset_to_process = pd.DataFrame(data_to_process,
                                               columns=column_names)

        correct_data = {
            'label': [0, 0, 1],
            'text':
            ['normalized text', 'another normalized text', 'normalized spam'],
        }
        self.correct_dataset = pd.DataFrame(correct_data, columns=column_names)
Example #19
 def test2():
     '''
     ===========================================================
     Following steps:
         1. Add numbers and punct
         2. No tokenizer
         3. Stop words
         4. pos tag
     ===========================================================
     '''
     print('running preprocessor test 2 ...')
     pattern = re.compile(r'[^а-яА-я0-9,.!?;\- ё]')
     analyzer = maru.get_analyzer(tagger='linear')
     config = preprocessor.Config(regexp=pattern,
                                  stopwords=stopwords_set,
                                  analyzer=analyzer,
                                  with_pos_tag=True,
                                  remove_stop_words=False,
                                  lemmatize=True,
                                  tokenizer=None)
     pipeline = preprocessor.Preprocessor(config)
     case = [
         'Так говорила в июле 1805 года известная',
         '— Как можно быть здоровой... когда нравственно страдаешь?',
         'праздник отменен, Je vous avoue que toutes ces fêtes'
     ]
     expected = [[
         'так_ADV', 'говорить_VERB', 'в_ADP', 'июль_NOUN', '1805_NUM',
         'год_NOUN', 'известный_ADJ'
     ],
                 [
                     'как_CONJ', 'можно_ADJ', 'быть_VERB',
                     'здоровой..._CONJ', 'когда_CONJ', 'нравственно_ADV',
                     'страдаешь?_PRON'
                 ], ['праздник_NOUN', 'отменен,_VERB']]
     res = pipeline.fit(case).transform(case)
     for res_line, expected_line in zip(res, expected):
         assert compare(res_line, expected_line), \
             'failed with {} and {}'.format(res_line, expected_line)
     print('test 2 passed')
Example #20
 def test4():
     '''
     ===========================================================
     Following steps:
         1. Add numbers and punct
         2. Razdel tokenizer
         3. Stop words
         4. No pos tag
         5. No lemmatization
     ===========================================================
     '''
     print('running preprocessor test 4 ...')
     pattern = re.compile(r'[^а-яА-я0-9,.!?;\- ё]')
     analyzer = maru.get_analyzer(tagger='linear')
     tokenizer = razdel
     config = preprocessor.Config(regexp=pattern,
                                  stopwords=stopwords_set,
                                  analyzer=analyzer,
                                  with_pos_tag=False,
                                  remove_stop_words=False,
                                  lemmatize=False,
                                  tokenizer=tokenizer)
     pipeline = preprocessor.Preprocessor(config)
     case = [
         'Так говорила в июле 1805 года известная',
         '— Как можно быть здоровой... когда нравственно страдаешь?',
         'праздник отменен, Je vous avoue que toutes ces fêtes'
     ]
     expected = [[
         'так', 'говорила', 'в', 'июле', '1805', 'года', 'известная'
     ],
                 [
                     'как', 'можно', 'быть', 'здоровой', '...', 'когда',
                     'нравственно', 'страдаешь', '?'
                 ], ['праздник', 'отменен', ',']]
     res = pipeline.fit(case).transform(case)
     for res_line, expected_line in zip(res, expected):
         assert compare(res_line, expected_line), \
             'failed with {} and {}'.format(res_line, expected_line)
     print('test 4 passed')
Example #21
def test():
    raw_img_data = Detector.read_img_jpeg_bytes("./pedestrain.jpg")
    det = Detector(mode="raw")
    ret = det.inference(raw_img_data)
    resp = det.get_detection_results(*ret)
    print("*** Inference result of raw image!")
    print(resp)
    del det
    gc.collect(1)
    gc.collect(2)

    # Test detection of preprocessed image
    prep = preprocessor.Preprocessor()
    compressed_img_data = prep.inference(raw_img_data, 70)
    print(
        f"*** Raw image size: {len(raw_img_data)}B, preprocessed image size: {len(compressed_img_data)}B"
    )
    det = Detector(mode="preprocessed")
    ret = det.inference(compressed_img_data)
    resp_prep = det.get_detection_results(*ret)
    print("*** Inference result of preprocessed image!")
    print(resp_prep)
Example #22
    def doc2vec(self, sentences):
        fname = get_tmpfile('doc2vec.model')
        edited_sentences = {}
        train_corpus = []
        count = 0
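        # Build a TaggedDocument corpus, keeping a map from tag index back to the original sentence.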
        for index, sentence in sentences.items():
            processed_sentence = preprocessor.Preprocessor(
                sentence).preprocessData()
            if not processed_sentence:
                continue
            else:
                tokens = gensim.utils.simple_preprocess(processed_sentence)
                # TaggedDocument expects a list of tags, not a bare string.
                train_corpus.append(TaggedDocument(tokens, [str(count)]))
                edited_sentences[count] = sentence
                count = count + 1

        model = Doc2Vec(train_corpus,
                        vector_size=10,
                        dbow_words=1,
                        dm=1,
                        window=2,
                        min_count=2)
        return (model, train_corpus, edited_sentences)
Example #23
 def test1():
     '''
     ===========================================================
     Full house:
         1. Leave only alphabet characters
         2. Remove stop words
         3. Lemmatize and add pos tags
     ===========================================================
     '''
     print('running preprocessor test 1 ...')
     pattern = re.compile(r'[^а-яА-я ё]')
     analyzer = maru.get_analyzer(tagger='linear')
     config = preprocessor.Config(regexp=pattern,
                                  stopwords=stopwords_set,
                                  analyzer=analyzer,
                                  with_pos_tag=True,
                                  remove_stop_words=True,
                                  lemmatize=True,
                                  tokenizer=None)
     pipeline = preprocessor.Preprocessor(config)
     case = [
         'Так говорила в июле 1805 года известная',
         '— Как можно быть здоровой... когда нравственно страдаешь?',
         'праздник отменен, Je vous avoue que toutes ces fêtes'
     ]
     expected = [
         ['говорить_VERB', 'июль_NOUN', 'год_NOUN', 'известный_ADJ'],
         ['здоровый_ADJ', 'нравственно_ADV', 'страдать_VERB'],
         ### here is the case when lemmatization fails
         ['праздник_NOUN', 'отменный_ADJ']
     ]
     res = pipeline.fit(case).transform(case)
     for res_line, expected_line in zip(res, expected):
         assert compare(res_line, expected_line), \
             'failed with {} and {}'.format(res_line, expected_line)
     print('test 1 passed')
Example #24
 def getNews(self):
     links = self.getGoogleLinks()
     print(len(links))
     news = {}
     for item in range(self.number):
         # Get the text of article
         date = int(links[item][0])
         news[date] = {}
         link = links[item][1]
         article = requests.get(link)
         soup = BeautifulSoup(article.text, "html.parser")
         for script in soup(["script", "style", "meta", "noscript"]):
             script.extract()  # rip it out
         text = soup.get_text()
         # Get the source
         source_1 = re.search('\.\\s*([^.]*)', link).group(1)
         source_2 = re.search('//\\s*([^.]*)', link).group(1)
         if "/" in source_1:
             source = source_2
         else:
             source = source_1
         news[date]['source'] = source
         news[date]['text'] = preprocessor.Preprocessor(text).preprocessData()
     return news
Example #25
    try:
        output += voodoo.voodooExpectSource(
            input=fullName,
            output=fullOutput,
            pathToRemoveFromIdentifier=inputPath,
            voodooDBFile=args.voodooDB,
            includes=args.includePath,
            defines=args.define,
            trace=False,
            preIncludes=args.preInclude)
        state = "V"
    except Exception as e:
        if str(e).find("all argume") != -1:
            raise
        inputLines = voodoo._readLinesOfFile(fullName)
        prepro = preprocessor.Preprocessor(fullName, fullOutput, inputLines,
                                           inputPath)
        output += prepro.intercepter()
        output += "\n/* The error that forced interception:\n" + \
                    str( e ).replace( "*/", "* /" ) + "\n"
        output += "\n"
        output += "Voodoo stack trace:\n" + traceback.format_exc()
        output += "*/\n"
        output += "\n"
        state = "I"
    # file() is Python 2 only; write the output with open() in a context manager.
    with open(fullOutput, "w") as f:
        f.write(output)

    sys.stdout.write("  <%d/%d> %s  %s\n" % (1 + fileList.index(
        (fullName, inputPath)), len(fileList), state, fullOutput))
Example #26
import csv
import matplotlib.pyplot as plt
import pandas as pd

from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential
from keras.callbacks import EarlyStopping

import preprocessor
import featureanalysis

# Above this value, the survival flag will be true
PROBABILITY_MARGIN_SURVIVAL = 0.5

prepr = preprocessor.Preprocessor()
prepr.process_training_dataset('train.csv')

df = pd.read_csv('train.csv')

# perform feature analysis
numerical_features = ["Survived", "SibSp", "Parch", "Age", "Fare"]
feat_analysis = featureanalysis.FeatureAnalysis(df)
feat_analysis.get_correlation_numerical_values(numerical_features)

# removed cabin and name columns
input_value, output = prepr.get_train_datasets()

# Get number of columns in training data
n_cols = input_value.shape[1]
Example #27
import sys

import preprocessor

CMD_OVERWRITE_OPTION = '-ow'

if __name__ == "__main__":
    # Check arguments
    if len(sys.argv) >= 3:
        # At least 2 arguments have been passed
        inp = sys.argv[1]
        out = sys.argv[2]
        # Define overwrite option
        overwrite = len(sys.argv) >= 4 and sys.argv[3] == CMD_OVERWRITE_OPTION
        p = preprocessor.Preprocessor(preprocessor.Language.vietnamese)
        try:
            p.preprocess_files(inp, out, {'overwrite': overwrite})
        except (FileNotFoundError, FileExistsError) as errors:
            for e in errors.args:
                if e:
                    print(e)
    else:
        print('Missing arguments. Arguments: input output [-ow]')
Example #28
from extractor import Extractor
import preprocessor
import dictionary
import vectorizer
import dataset_divider
import classifier
import time

start = time.time()
PreProcessor = preprocessor.Preprocessor()
Dictionary = dictionary.Dictionary()
categories = [1, 2, 3]  # Categories to be included
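# Extract each category's posts from the Flashback JSON dumps and split them into training/testing files.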

lines = []
for category in categories:
    lines.append(
        Extractor.extract(('flashback' + str(category) + '.json'),
                          ('extracted' + str(category) + '.txt')))
    dataset_divider.Divider.divide(('extracted' + str(category) + '.txt'),
                                   lines[len(lines) - 1])

# pre-processing of training data
processed = []
processed_test = []
for category in categories:
    processed.append(
        PreProcessor.preprocess('training' + str(category) + ".txt"))
    processed_test.append(
        PreProcessor.preprocess('testing' + str(category) + ".txt"))

with open("testingposts.txt", "w") as file:
Example #29
import argparse

import preprocessor
from definitions import TEST_PROCESSED_PATH, TRAIN_PROCESSED_PATH


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--train', action='store_true')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    preprocess = preprocessor.Preprocessor(train=args.train, dl=False)
    preprocess_data = preprocess.clean_data()
    path = TRAIN_PROCESSED_PATH if args.train else TEST_PROCESSED_PATH
    preprocess_data.to_csv(path, encoding='utf-8', index=False)
Example #30
def main():
    """Main method for controlling the flow of the stylometric analyser.

    Creates objects for word, character, punctuation and word-length analysis
    to determine the patterns of style in different works.

    """

    #Column names
    colnames = ['work', 'char_freq', 'punc_freq', 'stop_freq', 'word_len_freq']
    #Initializing an empty dataframe to store all stats after analysis
    all_text_stats = pd.DataFrame(columns=colnames)

    #Try block
    try:
        #-----------------------------Analysis----------------------------------
        #Main loop for doing the analysis file by file
        for work in works:
            #calling read_input function to read the content of each file
            content = read_input(work)

            #Creating object for preprocessor class
            pre_processor = prpscr.Preprocessor()
            pre_processor.tokenise(content)
            #Fetching the tokens
            tokens = pre_processor.get_tokenised_list()

            #Creating object for CharacterAnalyser class
            char_analyser = char.CharacterAnalyser()
            #Analysing at character level
            char_analyser.analyse_characters(tokens)
            #Fetching the character occurences
            ch_occ = char_analyser.char_occ
            #Fetching the punctuation occurences
            punc_occ = char_analyser.get_punctuation_frequency()

            #Creating object for WordAnalyser class
            word_analyser = word.WordAnalyser()
            #Analysing at word level
            word_analyser.analyse_words(tokens)
            #Fetching the stop word occurences
            stop_occ = word_analyser.get_stopword_frequency()
            #Fetching the word length occurences
            word_len_occ = word_analyser.get_word_length_frequency()

            #Temporary df to store all the analysis for one text at a time
            temp_df = pd.DataFrame(
                [[work, ch_occ, punc_occ, stop_occ, word_len_occ]],
                columns=colnames)

            # DataFrame.append was removed in pandas 2.x; use pd.concat instead.
            all_text_stats = pd.concat([all_text_stats, temp_df],
                                       ignore_index=True)

        #-----------------------------Visualisation-----------------------------
        #Creating object for Visualiser class
        visualiser = vis.AnalysisVisualiser(all_text_stats)
        #Visualising punctuation frequencies in all the works
        visualiser.visualise_punctuation_frequency()
        #Visualising character frequencies in all the works
        visualiser.visualise_character_frequency()
        #Visualising stopword frequencies in all the works
        visualiser.visualise_stopword_frequency()
        #Visualising word length frequencies in all the works
        visualiser.visualise_word_length_frequency()

    #Catch for exceptions
    except ImportError as err:
        print(
            'IMPORT ERROR :', err,
            '. Please check the working directory, name or ' +
            'make sure that module is imported!')
    except TypeError as err:
        print('TYPE ERROR :', err)
    except IndexError as err:
        print('INDEX ERROR :', err)
    except ValueError as err:
        print('VALUE ERROR :', err)
    except IOError as err:
        print('INPUT ERROR :', err, '. Please check the path of the file!')
    except requests.RequestException as err:
        print('REQUEST ERROR :', err)
    except Exception as err:
        print('UNEXPECTED ERROR :', err)