Code example #1
def preprocess_savefig(root, progressbar, paths, params, type):

    if not os.path.isdir(os.path.join(root, "preprocessed")):
        for x in ["individual", "joined"]:
            for y in ["healthy", "defective"]:
                os.makedirs(os.path.join(root, "preprocessed", x, y))
    
    progressbar["value"] = 0
    progressbar["maximum"] = len(paths)

    _, img_sample = preprocess(img_path=paths[0][0], params=params)
    shape = img_sample.shape
    imgs = []
    for p in range(len(paths)):
        img_join = np.zeros((shape[0]*2, shape[1], shape[2]), dtype='uint8')
        for n in range(2):
            _, img_np = preprocess(img_path=paths[p][n], params=params)
            img_join[n*shape[0]: (n+1)*shape[0]] = img_np
            img_pil = Image.fromarray(img_np)
            img_pil.save(os.path.join(root, "preprocessed", "individual", type, f"bean{p+1}_side{n+1}.png"))
        img_pil = Image.fromarray(img_join)
        img_pil.save(os.path.join(root, "preprocessed", "joined", type, f"bean{p+1}.png"))

        progressbar["value"] += 1
        progressbar.update()
    progressbar["value"] = 0
    progressbar.update()
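A minimal invocation sketch for the example above, assuming preprocess_savefig and its preprocess helper are importable from the same module; the dataset root, image-path pairs and params below are placeholders, not values from the original project.

import tkinter as tk
from tkinter import ttk

root_dir = "dataset"                            # hypothetical dataset root
paths = [("bean1_a.png", "bean1_b.png")]        # hypothetical (side 1, side 2) image pairs
params = {}                                     # whatever preprocess() expects

win = tk.Tk()
bar = ttk.Progressbar(win, maximum=len(paths))  # the function drives this widget directly
bar.pack()

preprocess_savefig(root_dir, bar, paths, params, type="healthy")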
Code example #2
def main():
    train_file ="/Users/phx/downloads/competetion/recipe/train.json"
    with open(train_file) as file:
        data = json.load(file)
    print("size of dataset %d" % len(data))

    data = preprocess(data)

    train_data = [data[i] for i in xrange(0,len(data)) if i%3 !=0]
    test_data = [data[i] for i in xrange(0,len(data)) if i%3 ==0]

    #test_data= preprocess(test_data)

    attribute_map = getAttributeMap(train_data,1)

    print('attribute number : %d' % len(attribute_map))
    print(attribute_map)


    label_map = getLabelMap(data)
    print('label number : %d' %len(label_map))
    print(label_map)
    X,y = getDataSet(train_data,attribute_map,label_map)
    testX,testY= getDataSet(test_data,attribute_map,label_map)
    sgd = SGDClassifier(loss='log')
    generate_save_proba(sgd,X,y,testX,testY,"SGDClassifier.loss_log")
    mnb = MultinomialNB(alpha=0.08, class_prior=None, fit_prior=True)
    generate_save_proba(mnb,X,y,testX,testY,"MultinomialNB.alpha_0.08")
    rf = RandomForestClassifier(n_estimators=500)
    generate_save_proba(rf,X,y,testX,testY,"RandomForestClassifier.n_estimators_500")
    """
Code example #3
def run_extractor():
    """Run the full extraction pipeline"""

    subprocess.call('mkdir data/reviews', shell=True)
    subprocess.call('mkdir data/tagged', shell=True)
    subprocess.call('mkdir data/untagged', shell=True)
    subprocess.call('mkdir data/to_parse', shell=True)
    subprocess.call('mkdir data/parsed/', shell=True)

    preprocessor.preprocess()

    subprocess.call('javac -cp ./external/stanford-postagger.jar Tagger.java',
                    shell=True)
    subprocess.call(
        'java -cp ".:./external/stanford-postagger.jar" -Xmx1024m Tagger ./external/left3words-wsj-0-18.tagger data/reviews data/tagged data/untagged',
        shell=True)

    multiword_attr_identifier.identify_multiword_attrs()
    parser_preparation.pre_parse()
    parser.parse_parallel(4)
    extraction_generator.generate_extractions()
    common_extraction_generator.generate_common_extractions()
    attribute_classifier.classify()
    extraction_filterer.filter_extractions()
    polarity_computer.compute_polarities()
Code example #4
def main():

    choice = input(
        "Do you want to clean the original the files first ? (Y/N) \n")
    if choice == 'y' or choice == 'Y':
        i = 0

        # To get the list of files avoiding the hidden files that may start with '.' or '~'
        original_filelist = [
            f for f in listdir(original_filepath)
            if not (f.startswith('.') or f.startswith('~'))
        ]

        for filename in original_filelist:
            i += 1
            print(filename)
            preprocess(filename)
            print(i, "file cleaning done")

    choice = input("Do you want to process the cleaned files ? (Y/N) \n")

    if choice == 'y' or choice == 'Y':

        for size in windowsizes:
            window_generator(size)

    perform_calc()
Code example #5
File: main.py Project: spectator05/capstone-2021-18
def ip_16_32_count(file_name):
    # adding ip count, network(IP/16bit) count to 'stat_dict' by preprocess.py
    stat_dict = {'ip':{}, 'network':{}}
    for fname in tqdm(file_name, total=len(file_name)):
            with open(rf"{data_path}{os.sep}{fname}", "rb") as file:
                pk = pickle.load(file)
                preprocess(pk, stat_dict)
    return stat_dict
Code example #6
def get_input(image, boxes):
    images = get_cropped_images(boxes, image)
    preprocessed_images = [
        normalize(images),
        preprocess(images, histogram_stretching),
        preprocess(images, histogram_equalization), boxes
    ]
    return preprocessed_images
Code example #7
def get_input(image, boxes):
    images = get_cropped_images(boxes, image)
    adeq_images = preprocess(np.array([image]),
                             adaptive_histogram_equalization)[0]
    preprocessed_images = [
        preprocess(images, histogram_equalization),
        preprocess(images, histogram_stretching),
        get_cropped_images(boxes, adeq_images), boxes
    ]
    return preprocessed_images
Code example #8
def get_preprocessed_images(images):
    images = [
        normalize(images),
        preprocess(images, histogram_stretching),
        preprocess(images, histogram_equalization)
    ]
    images = [
        np.array([resize(img, (256, 256)) for img in imgs]) for imgs in images
    ]
    return images
Code example #9
def process_link(link):
    """
    Processes the given link, does some noise removal
    and return the detailed page in form of a HTMLNode object
    :param str link:
    :return: the detailed page as an HTMLNode, or None on failure
    """
    website = constants.website

    # Build the right website link
    if link[0] == '/':
        if re.search(r'(^((http[s]{0,1}://)?www\.)?.+\.[a-z]+)/', website) is None:
            constants.logger.error('Unknown website link format')
            return None
        else:
            site_pref = re.findall(r'(^((http[s]{0,1}://)?www\.)?.+\.[a-z]+)/', website)[0][0]
            website = site_pref + link
    else:
        website = link

    # Launch website and get HTML code
    try:
        response = urllib.request.urlopen(website)
    except urllib.error.URLError:
        constants.logger.error('Page:"%s" was not able to launch' % website)
        return None
    source = response.read().decode('latin-1')

    # Transfer HTML code via easyhtml
    dom_parser = parser.DOMParser()
    dom_parser.feed(str(source))
    document = dom_parser.get_dom()

    # Finding the html node
    html_object = None
    for node in document.elements:
        if isinstance(node, easyhtml.dom.HTMLTag):
            if node.tag_name == 'html':
                html_object = node
                break
    if html_object is None:
        constants.logger.error('No html tag was found on detailed page')
        return None

    # Transforming the dom tree into the built in data objects
    # of HTMLNodes
    detailed_page = HTMLNode(html_object, 0)

    # preprocessing and noise removal
    preprocessor.preprocess(detailed_page)
    preprocessor.remove_noise_dp(detailed_page)

    # Finding and returning main text
    return detailed_page
Code example #10
def get_inputs(images, boxes):
    cropped_images = np.array([imgs for i in range(0, len(images)) for imgs in get_cropped_images(boxes[i], images[i])])
    flattened_boxes = np.array([values for _boxes in boxes for values in _boxes])

    preprocessed_images = [
        normalize(cropped_images),
        preprocess(cropped_images, histogram_stretching),
        preprocess(cropped_images, histogram_equalization),
        flattened_boxes
    ]
    return preprocessed_images
Code example #11
File: prequeue.py Project: katlabs/pokecrystal
def main():
    macros = preprocessor.load_pokecrystal_macros()
    macro_table = preprocessor.make_macro_table(macros)

    stdout = sys.stdout

    for source in sys.argv[1:]:
        dest = os.path.splitext(source)[0] + '.tx'
        sys.stdin = open(source, 'r')
        sys.stdout = open(dest, 'w')
        preprocessor.preprocess(macro_table)

    # reset stdout
    sys.stdout = stdout
Code example #12
File: prequeue.py Project: Sanqui/pokecrystal
def main():
    macros = preprocessor.load_pokecrystal_macros()
    macro_table = preprocessor.make_macro_table(macros)

    stdout = sys.stdout

    for source in sys.argv[1:]:
        dest = os.path.splitext(source)[0] + '.tx'
        sys.stdin  = open(source, 'r')
        sys.stdout = open(dest, 'w')
        preprocessor.preprocess(macro_table)

    # reset stdout
    sys.stdout = stdout
Code example #13
def get_num_episodes():
    while True:
        try:
            with open(info_filename, 'r') as f:
                lines = f.readlines()
                #last line is blank.
                num_eps = len(lines) - 1
                assert num_eps > 0
                break
        except:
            print('preprocessing...')
            preprocessor.preprocess()
    print('preprocessing completed')
    return num_eps
Code example #14
File: prequeue.py Project: MrWint/pokecrystal
def main():
    config = configuration.Config()
    macros = preprocessor.load_pokecrystal_macros()

    stdout = sys.stdout

    for source in sys.argv[1:]:
        dest = os.path.splitext(source)[0] + '.tx'
        sys.stdin  = open(source, 'r')
        sys.stdout = open(dest, 'w')
        preprocessor.preprocess(config, macros)

    # reset stdout
    sys.stdout = stdout
Code example #15
def improved_indexer(documents):
    index = {}
    m = len(documents)
    doc_lengths = {}

    stopword_file = open(os.path.join(os.path.dirname(__file__), 'stopword_list.txt'), 'r')
    stopword_list = []

    for line in stopword_file:
        stopword_list.append(line.rstrip())

    termlist = {}
    for recordnum in documents:
        document = documents[recordnum]['text']
        doc_lengths[recordnum] = []

        for (i, field) in enumerate(document):
            priority = i
            tokens = preprocess(field, stopword_list)
            for token in tokens:
                if (token, priority) in index:
                    if recordnum in index[(token, priority)]:
                        index[(token, priority)][recordnum] += 1
                    else:
                        index[(token, priority)][recordnum] = 1
                else:
                    index[(token, priority)] = {recordnum: 1}

                if token in termlist:
                    if recordnum not in termlist[token]:
                        termlist[token].append(recordnum)
                else:
                    termlist[token] = [recordnum]

            doc_lengths[recordnum].append(len(tokens))

    all_doc_lengths = [doc_lengths[recordnum] for recordnum in doc_lengths]
    doc_lengths_avg = numpy.average(numpy.matrix(all_doc_lengths), axis=0).tolist()[0]

    doc_lengths['avg'] = doc_lengths_avg

    enhanced_index = {}
    d = defaultdict(list)

    for word, priority in index:
        d[word].append(priority)
    terms = dict((k, v) for (k, v) in d.items())

    for word in terms:

        docs_with_word = termlist[word]
        idf = log10((m+1.0) / len(docs_with_word))

        for priority in terms[word]:
            enhanced_index[(word, priority)] = {}

            for document in index[(word, priority)]:
                enhanced_index[(word, priority)][document] = [index[(word, priority)][document], idf]

    return [enhanced_index, doc_lengths]
Code example #16
def process():

    # Read the form data from the HTTP request
    text = request.form.get("text", "")

    # Run preprocessing
    text = preprocess(text)

    # Run tagging
    text = tag(text, "http://localhost:7000")

    # Run chunking
    text = chunk(text)

    # Run normalization
    text = normalize(text)

    # Build a JSON HTTP response containing the processed text
    return jsonify({
        "status": "success",
        "message": "Request successful",
        "data": {
            "text": text
        }
    })
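A hypothetical client call for the handler above, assuming it is registered as a POST route (for example /process) on a locally running Flask app; the port and route are placeholders, not taken from the original project.

import requests

resp = requests.post("http://localhost:5000/process", data={"text": "contoh kalimat"})
print(resp.json()["data"]["text"])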
Code example #17
def get_topic_sentiment_nltk(topic_keywords):
    topic_max_distance = [0]
    topic_min_distance = [0]
    topic_sarcastic = False

    for topic_keyword in topic_keywords:
        tweets = get_tweets_for_feature_extraction(topic_keyword, 3)

        tweets_positive = [0]
        tweets_negative = [0]
        tweets_sarcastic = False
        for tweet in tweets:
            processed_tweet = preprocess(tweet["text"])
            processed_text = processed_tweet["text"]

            tokens = nltk.word_tokenize(processed_text)
            tokens = [(t.lower()) for t in tokens]

            mean_sentiment = sentiment_helper.score_sentence(tokens)
            positive_sentence_sentiment = mean_sentiment[0]
            negative_sentence_sentiment = mean_sentiment[1]

            tweets_positive.append(positive_sentence_sentiment)
            tweets_negative.append(negative_sentence_sentiment)
            tweets_sarcastic = ("#sarcasm" in processed_tweet["hashtags"]) or tweets_sarcastic

        topic_max_distance.append(max(tweets_positive) - min(tweets_positive))
        topic_min_distance.append(max(tweets_negative) - min(tweets_negative))
        topic_sarcastic = topic_sarcastic or tweets_sarcastic

    return sum(topic_max_distance) / (len(topic_keywords) or 1), sum(topic_min_distance) / (len(topic_keywords) or 1), int(topic_sarcastic)
Code example #18
File: codeprocessor.py Project: nzbr/pandocode
def process_line(line):
    if line.strip() == '':  # Don't process empty lines any further
        if cCountEmptyLines:
            return "\\State", None, False, 0
        else:
            return "\\Statex", None, False, 0

    sp = line.split("#")
    comment = ""
    if len(sp) > 1:
        if len(sp[-2]) == 0 or not sp[-2][-1] == "\\":
            comment = sp[-1]
            line = "\\#".join(sp[:-1])
        else:
            if not len(sp[-2]) == 0:
                sp[-2] = sp[-2][:-1]
            line = "\\#".join(sp)

    comment = comment.strip()
    line = line.strip()
    line = preprocess(line)

    terminator = None
    process_lvl = False
    transform = 0
    if line == "":
        line = generate_comment_line(comment)
    else:
        keyword = get_keyword(line)
        generator = get_generator(keyword)
        line, terminator, process_lvl, transform = generator(line)
        if not comment == "":
            line += " \\Comment{\ " + comment + "}"

    return line, terminator, process_lvl, transform  # Add generated line to result
Code example #19
def telemetry(sid, data):
    if data:
        # The current steering angle of the car
        steering_angle = data["steering_angle"]
        # The current throttle of the car
        throttle = data["throttle"]
        # The current speed of the car
        speed = data["speed"]
        # The current image from the center camera of the car
        imgString = data["image"]
        image = Image.open(BytesIO(base64.b64decode(imgString)))
        image_array = np.asarray(image)
        image_array = preprocessor.preprocess(image_array)
        steering_angle = float(
            model.predict(image_array[None, :, :, :], batch_size=1))

        throttle = controller.update(float(speed))

        print(steering_angle, throttle)
        send_control(steering_angle, throttle)

        # save frame
        if args.image_folder != '':
            timestamp = datetime.utcnow().strftime('%Y_%m_%d_%H_%M_%S_%f')[:-3]
            image_filename = os.path.join(args.image_folder, timestamp)
            image.save('{}.jpg'.format(image_filename))
    else:
        # NOTE: DON'T EDIT THIS.
        sio.emit('manual', data={}, skip_sid=True)
Code example #20
def run_configs(data_dir, reviews_filename):
    # directory where the preprocessed files will be stored
    preprocessed_dir = data_dir + "preprocessed_files/"

    # directory of raw data eg. {root}/data/electronics/reviews_Electronics_5
    filename = data_dir + reviews_filename

    # file endings
    raw = filename + ".json.gz"
    reviews = filename + "_reviews.txt"
    ratings = filename + "_ratings.npy"

    # possible preprocessing steps
    preprocess_steps = {
        "reg_lemma":
        ["clean", "regexp_tokenize", "remove_stop_words", "lemmatize"],
        # "reg_stem": ["clean", "regexp_tokenize", "remove_stop_words", "stem"],
        # "tw_lemma": ["clean", "tweet_tokenize", "remove_stop_words", "lemmatize"],
        # "tw_stem": ["clean", "tweet_tokenize", "remove_stop_words", "stem"],
    }

    for step in preprocess_steps:
        # generate a new filename eg. {root}/data/preprocessed_files/electronics/reviews_Electronics_5_tw_stem.txt
        preprocessed_filename = preprocessed_dir + filename.replace(
            data_dir, "") + "_" + step + ".txt"
        # if given file does not exist, preprocess input file with given steps and save it
        if not os.path.isfile(preprocessed_filename):
            preprocessed_texts = preprocessor.preprocess(
                reviews, preprocess_steps[step])
            preprocessor.save_texts(preprocessed_texts, preprocessed_filename)
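A hypothetical invocation of the function above; the directory and filename mirror the comments in the code but are placeholders, not paths from the original project.

run_configs(data_dir="data/electronics/", reviews_filename="reviews_Electronics_5")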
Code example #21
File: recommender.py Project: gacha17/health-engine
def recommend(inputs):
    recommendation_list = []
    all_recommendation = analyzer.recommend_start(inputs)
    features = preprocessor.preprocess(inputs)
    recommendations = lookup_table.lookup(features)
    recommendations.append(all_recommendation)
    return all_recommendation
Code example #22
File: __init__.py Project: rajan-garg/ALL-8085
    def on_start_file_chooser_button_clicked(self, widget):
        window = self.shell_ui.get_object("all_window")
        dialog = Gtk.FileChooserDialog(
            title="Please choose a file",
            parent=window,
            action=Gtk.FileChooserAction.OPEN,
            buttons=(Gtk.STOCK_CANCEL, Gtk.ResponseType.CANCEL, Gtk.STOCK_OPEN,
                     Gtk.ResponseType.OK))
        response = dialog.run()
        if response == Gtk.ResponseType.OK:
            selected_file_path = dialog.get_filename()
            relative_path = os.path.basename(selected_file_path)
            inputfile = open(relative_path, "r")
            code = inputfile.read()
            lines = code.split('\n')
            finalfile = lines[0].split('.')[0] + '.8085'
            print(lines[0].split('.')[0])
            print(finalfile)

            entries_box = self.shell_ui.get_object("start_entries_box")
            wids = entries_box.get_children()
            for widget in wids:
                widget.destroy()
            i = 0
            print(lines)
            for line in lines:
                if line != '':
                    self.z.append(line)
                    label = Gtk.Label("Code" + str(i))
                    tv = Gtk.TextView()
                    tb = tv.get_buffer()
                    entries_box.add(label)
                    entries_box.add(tv)
                    i += 1
                    with open(line, "r") as file:
                        s = file.read()
                        tb.set_text(s)
                        print(s)
            self.shell_ui.get_object("start_entry_number_entry").set_text(
                str(i))
            entries_box.show_all()
            self.x = preprocess(self.z)
            processed_box = self.shell_ui.get_object("processed_box")
            i = 0
            for file_name in self.x:
                if file_name != '':
                    label = Gtk.Label("Code" + str(i))
                    tv = Gtk.TextView()
                    tb = tv.get_buffer()
                    processed_box.add(label)
                    processed_box.add(tv)
                    i += 1
                    with open(file_name, "r") as file:
                        s = file.read()
                        tb.set_text(s)
                        print(s)
            processed_box.show_all()
        elif response == Gtk.ResponseType.CANCEL:
            print("Cancel clicked")
        dialog.destroy()
Code example #23
File: attitydlig.py Project: Leopaexd/attitydlig
def translated_data():
    directory = 'C:\\Users\\olive\\Desktop\\Datasets_for_thesis\\Prisjakt\\training_data'
    extracted_data = extractor.json_extract(directory)
    extracted_reviews = extracted_data[0]
    polarities = extracted_data[1]
    preprocessed_reviews = preprocessor.preprocess(extracted_reviews)
    dictionary = dict.Dictionary(preprocessed_reviews).dictionary
    # review_translator.translate_reviews(preprocessed_reviews, polarities)
    with open('untranslated_reviews validation combined.txt', 'r') as file:
        untranslated_reviews = np.concatenate(
            vectorizer.vectorize_data(
                preprocessor.preprocess(file.readlines()), dictionary, 300))
    with open('translated_polarities validation combined.txt', 'r') as file:
        translated_polarities = []
        for line in file:
            translated_polarities.append(int(line))
    return [untranslated_reviews, np.array(translated_polarities)]
Code example #24
def km(num):
    
    num = int(num)
    data = preprocess('data.txt', None, [])
    X_principal = xnormalize(data)
    km_name = kmeans_cluster(X_principal , num)

    return {"figure": "cluster/" + km_name}
Code example #25
    def test_success(self):
        list_docs = [
            'Hôm nay, tôi đi học. 12321 ', 'Hôm nay, trời 432 đẹp quá!'
        ]
        list_docs = preprocessor.preprocess(list_docs)

        transformer = Text2Vector()
        transformer.fit(list_docs)

        print('Most comment words: ', transformer.get_most_common(10))

        vec = transformer.doc_to_vec(
            preprocessor.preprocess(
                ['Hôm nay, tôi 332 đi học.', 'Hôm nay, 43 tôi đi chơi.!']))
        print('Vec: ', vec)
        text = transformer.vec_to_doc(vec)
        print('Text: ', text)
Code example #26
 def __init__(self, content):
     self.content = content
     self.sents = preprocess(content)
     self.word2count = self.countword()
     self.k1 = 1.50
     self.b = 0.75
     self.stopWords = stopwords.words('english')
     self.title = None
Code example #27
 def test_preprocess_with_prefix_and_suffix(self):
     parts = {'qwe', 'wer'}
     prefix, suffix = 'prefix', 'suffix'
     s = ''.join((prefix, '{', '|'.join(parts), '}', suffix))
     result = set(preprocess(s))
     expected = set(''.join((prefix, '{', value, '}', suffix))
                    for value in parts)
     self.assertEqual(expected, result)
Code example #28
def main(range_map_geodatabase_path, layer_name,
         forest_dependency_spreadsheet_path, global_canopy_cover_thresh,
         aoo_canopy_cover_thresh, altitude_limits_table_path,
         generation_lengths_table_path):
    """This function is the core of the application. It performs the pre-processing,
    analysis and post-processing.

    :param range_map_geodatabase_path: Path to an ESRI file geodatabase containing range
        maps to be analysed. See README for required format.
    :param layer_name: Name of the layer in the geodatabase at geodatabase_path
        containing the range maps to be analysed.
    :param forest_dependency_spreadsheet_path: Path to a spreadsheet containing species'
        forest dependency information. See README for required format.
    :param global_canopy_cover_thresh: Pixels in the "treecover2000" layer with an
        intensity less than this threshold are excluded from all computations: they
        are not counted as tree cover.
    :param aoo_canopy_cover_thresh: 2km by 2km grid cells containing a proportion of
        tree cover greater than aoo_canopy_cover_thresh are counted as forested cells
        for the purpose of AOO estimation.
    :param altitude_limits_table_path: Path to a CSV file containing species' minimum
        and maximum altitudes. See README for required format.
    :param generation_lengths_table_path: Path to a CSV file containing species'
        generation lengths. See README for required format.
    :return:
    """
    # Google Cloud Platform authentication.
    os.system('gcloud auth login')
    # Google Earth Engine authentication.
    ee.Authenticate()

    ee.Initialize()

    range_map_ic_gee_path = preprocess(range_map_geodatabase_path, layer_name,
                                       forest_dependency_spreadsheet_path)

    print_w_timestamp('Waiting for all GEE tasks to complete...')
    wait_until_all_tasks_complete()
    print_w_timestamp('Done.')

    if global_canopy_cover_thresh:
        if aoo_canopy_cover_thresh:
            analyse(altitude_limits_table_path, range_map_ic_gee_path,
                    global_canopy_cover_thresh, aoo_canopy_cover_thresh)
        else:
            analyse(altitude_limits_table_path, range_map_ic_gee_path,
                    global_canopy_cover_thresh)
    else:
        if aoo_canopy_cover_thresh:
            analyse(altitude_limits_table_path, range_map_ic_gee_path,
                    aoo_canopy_cover_thresh)
        else:
            analyse(altitude_limits_table_path, range_map_ic_gee_path)

    print_w_timestamp('Waiting for all GEE tasks to complete...')
    wait_until_all_tasks_complete()
    print_w_timestamp('Done.')

    postprocess(generation_lengths_table_path)
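A hypothetical invocation matching the docstring above; every path and threshold is a placeholder rather than a value from the original project.

if __name__ == '__main__':
    main(
        range_map_geodatabase_path='ranges.gdb',
        layer_name='species_ranges',
        forest_dependency_spreadsheet_path='forest_dependency.xlsx',
        global_canopy_cover_thresh=30,
        aoo_canopy_cover_thresh=0.5,
        altitude_limits_table_path='altitude_limits.csv',
        generation_lengths_table_path='generation_lengths.csv',
    )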
Code example #29
    def input(self, s):
        self.lexer.lineno = ExtendedLineNo(1, 0)
        self.errors = []
        self.lexer.errors = self.errors
        self.token_gen = self.generator(self.lexer.token)

        ps = preprocess(s)

        return self.lexer.input(ps)
Code example #30
File: analyze.py Project: psambit9791/NewsGenerator
def genSent():
    objPre = preprocess()
    objPre = objPre.load()
    sentences = obj.generateSent(objPre.word_to_index, 1000,
                                 objPre.index_to_word)
    print sentences[:5]
    print "writing " + str(len(sentences)) + " news"
    write_line = '\n'.join(sentences)
    open(FILE_NAME + '_sentences', 'w').write(write_line.encode('utf-8'))
Code example #31
File: worker.py Project: abloch/yap_worker
def process_message(msg, channel=None):
    slice = json.loads(msg)
    text = preprocess(slice['text'])
    reply = None
    yap_reply = get_yap(text)
    slice['tokens'] = get_tokens(yap_reply)
    slice['raw_yap'] = yap_reply
    submit_yapped(slice, channel)
    all_processed.append(slice)
Code example #32
File: __init__.py Project: theawless/PALLS-8085
    def on_start_file_chooser_button_clicked(self, widget):
        window = self.shell_ui.get_object("all_window")
        dialog = Gtk.FileChooserDialog(title="Please choose a file", parent=window, action=Gtk.FileChooserAction.OPEN,
                                       buttons=(
                                           Gtk.STOCK_CANCEL, Gtk.ResponseType.CANCEL, Gtk.STOCK_OPEN,
                                           Gtk.ResponseType.OK))
        response = dialog.run()
        if response == Gtk.ResponseType.OK:
            selected_file_path = dialog.get_filename()
            relative_path = os.path.basename(selected_file_path)
            inputfile = open(relative_path, "r")
            code = inputfile.read()
            lines = code.split('\n')
            finalfile = lines[0].split('.')[0] + '.8085'
            print(lines[0].split('.')[0])
            print(finalfile)

            entries_box = self.shell_ui.get_object("start_entries_box")
            wids = entries_box.get_children()
            for widget in wids:
                widget.destroy()
            i = 0
            print (lines)
            for line in lines:
                if line != '':
                    self.z.append(line)
                    label = Gtk.Label("Code" + str(i))
                    tv = Gtk.TextView()
                    tb = tv.get_buffer()
                    entries_box.add(label)
                    entries_box.add(tv)
                    i += 1
                    with open(line, "r") as file:
                        s = file.read()
                        tb.set_text(s)
                        print(s)
            self.shell_ui.get_object("start_entry_number_entry").set_text(str(i))
            entries_box.show_all()
            self.x = preprocess(self.z)
            processed_box = self.shell_ui.get_object("processed_box")
            i = 0
            for file_name in self.x:
                if file_name != '':
                    label = Gtk.Label("Code" + str(i))
                    tv = Gtk.TextView()
                    tb = tv.get_buffer()
                    processed_box.add(label)
                    processed_box.add(tv)
                    i += 1
                    with open(file_name, "r") as file:
                        s = file.read()
                        tb.set_text(s)
                        print(s)
            processed_box.show_all()
        elif response == Gtk.ResponseType.CANCEL:
            print("Cancel clicked")
        dialog.destroy()
Code example #33
def calculate_tf_idf_docs():
    #Get the list of documents and their data
    documents = fetch_documents()

    #Preprocess Documents
    preprocessed_documents = []
    for document in documents:
        preprocessed_documents.append(preprocess(document))

    documents = preprocessed_documents

    # Find the list of unique words in the document dataset
    list_of_words = []
    for document in documents:
        for word in document:
            if word not in list_of_words:
                list_of_words.append(word)

    N = len(documents) + 1

    # Generate vector for each document
    copy_documents = documents
    documents_vector = []
    for document in documents:
        doc_vector = []
        for word in list_of_words:
            #Calculate term frequency
            tf = 0
            for term in document:
                if term == word:
                    tf = tf + 1

            #Calculate document frequency
            df = 0
            for copy_document in copy_documents:
                if word in copy_document:
                    df = df + 1

            #Calculate tf-idf
            idf = math.log(N / df)
            tfidf = tf * idf
            doc_vector.append(tfidf)

        documents_vector.append(doc_vector)

    #Generate database
    db = {}
    db['list_of_words'] = list_of_words
    db['N'] = N
    db['documents_vector'] = documents_vector
    db['documents'] = documents

    #Save data to persistence storage
    pickle_out = open(PREPROCESSED_DATA, 'wb')
    pickle.dump(db, pickle_out)
    pickle_out.close()
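A small follow-up sketch, assuming PREPROCESSED_DATA is the same path used above; it only reloads the pickled index to confirm what was stored.

import pickle

with open(PREPROCESSED_DATA, 'rb') as pickle_in:
    db = pickle.load(pickle_in)
print(len(db['list_of_words']), "unique terms over", db['N'] - 1, "documents")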
Code example #34
def interpret(text):
    text, line_nums, indent_str = preprocess(text)
    program = core_parser.parse(text, trace=False)
    context = Context(op_parser, keywords)
    for stmt in program:
        if isinstance(stmt, Block):
            context.keywords[stmt.keyword](stmt.header, stmt.body, context,
                                           context)
        else:
            print(cata(stmt, lambda ast: parse_ops(ast, context)))
Code example #35
def load_image_and_steering_for_train(csv_line):

    correction = 0.2

    steering = float(csv_line[3])
    i = random.randint(0, 2)

    if i == 0:
        img = preprocessor.preprocess(cv2.imread(csv_line[0]))
    elif i == 1:
        img = preprocessor.preprocess(cv2.imread(csv_line[1]))
        steering = steering + correction
    else:
        img = preprocessor.preprocess(cv2.imread(csv_line[2]))
        steering = steering - correction

    img, steering = random_augment(img, steering)

    return img, steering
Code example #36
def generate_ground_data(image_path):
    image, img_txt = read_image(image_path)
    copy = image.copy()
    image, segments, euler_list, central_x, central_y = preprocess(image)
    feature_list = get_feature_list(
        image, segments, euler_list, central_x, central_y)[0]
    classes_list = get_class_list(copy, segments)
    with open("%s" % img_txt, 'wb') as test:
        for char, feature in zip(classes_list, feature_list):
            test.write("%s %s\n" % (chr(char), ' '.join(map(str, feature))))
Code example #37
File: recognizer.py Project: zhangchaolts/Captcha
def recognize(pic, dir_train_pics):
	pic = Image.open(pic)
	print 'preprocessor.preprocess'
	pic_preprocessed = preprocessor.preprocess(pic)
	block_array = []
	print 'spliter.split'
	spliter.split(pic_preprocessed, block_array)
	captcha = ""
	if len(block_array) == 4:
		print 'recognize_block_array'
		captcha = recognize_block_array(block_array, dir_train_pics)
	return captcha
Code example #38
File: recognizer.py Project: zhangchaolts/PyQt
def recognize(pic, dir_train_pics):
	pic = Image.open(pic)
	#print 'preprocessor.preprocess'
	pic_preprocessed = preprocessor.preprocess(pic)
	block_array = []
	#print 'spliter.split'
	spliter.split(pic_preprocessed, block_array)
	captcha = ""
	if len(block_array) >= THRESHOLD_BLOCK_NUMBER:
		#print 'recognize_block_array'
		captcha = recognize_block_array(block_array, dir_train_pics)
	return captcha
Code example #39
def run_extractor():
  """Run the full extraction pipeline"""
  
  subprocess.call('mkdir data/reviews', shell=True)
  subprocess.call('mkdir data/tagged', shell=True)
  subprocess.call('mkdir data/untagged', shell=True)
  subprocess.call('mkdir data/to_parse', shell=True)
  subprocess.call('mkdir data/parsed/', shell=True)
  
  preprocessor.preprocess()
  
  subprocess.call('javac -cp ./external/stanford-postagger.jar Tagger.java', shell=True)
  subprocess.call('java -cp ".:./external/stanford-postagger.jar" -Xmx1024m Tagger ./external/left3words-wsj-0-18.tagger data/reviews data/tagged data/untagged', shell=True)
  
  multiword_attr_identifier.identify_multiword_attrs()
  parser_preparation.pre_parse()
  parser.parse_parallel(4)
  extraction_generator.generate_extractions()
  common_extraction_generator.generate_common_extractions()
  attribute_classifier.classify()
  extraction_filterer.filter_extractions()
  polarity_computer.compute_polarities()
Code example #40
File: assembler.py Project: msiemens/TINY.ASM
def assembler_to_hex(source_code, filename=None, preprocessor_only=False):
    """
    Convert an assembler program to `Tiny` machine code.

    Opcodes described at http://redd.it/1kqxz9
    """

    code = preprocess(source_code, filename or '<input>')

    if preprocessor_only:
        return '\n'.join(c.contents for c in code)

    return assemble(code)
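A minimal usage sketch, assuming assembler_to_hex is imported from the project's assembler module; the input file name is a placeholder and its contents are not shown here.

with open("program.asm") as f:          # hypothetical input file
    source = f.read()
machine_code = assembler_to_hex(source, filename="program.asm")
print(machine_code)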
Code example #41
File: view.py Project: bwesterb/DCPU-16
 def assemble(self):
     text = self.editor.GetText()
     try:
         self.reset(False)
         self.program = assembler.parse(preprocessor.preprocess(text))
         self.emu.load(self.program.assemble())
         self.program_list.update(self.program.instructions)
         self.refresh_debug_info()
     except Exception as e:
         self.reset(False)
         dialog = wx.MessageDialog(self, str(e), 'Error',
             wx.ICON_ERROR | wx.OK)
         dialog.ShowModal()
         dialog.Destroy()
Code example #42
 def __init__(self, program, mode = "MIPS"):
   super(Assembler, self).__init__()
   try:                   text = program.read()
   except AttributeError: text = program
   self.mode = mode.upper()
   self.registers = Registers(self.mode)
   lines = text.split("\n")
   lines = clean(lines, self.mode)
   instrs, data = split_sections(lines)
   self.memory = Memory()
   for d in data: self.memory.insert(d)
   instrs = preprocess(instrs, self.mode)
   self.labels = label_positions(instrs)
   self.instructions = [Instruction(instr) for instr in instrs]
Code example #43
File: data_handler.py Project: milanjain81/analysis
def get_frame(conf, sensor, location, start, end):
    df = pandas.DataFrame()

    # UUID of the data to retrieve
    uuid = conf[sensor][location]["uuid"]
    # IP Address of the Archiver
    server = conf["archiver"]
    # Port of the Archiver
    port = conf[sensor][location]["archiver_port"]

    # Title of the column in the frame
    title = sensor.title() + "_" + location

    # Get frame for each location
    tframe = get_data(uuid, server, port, title, start, end)

    df = preprocess(tframe)
    return df
Code example #44
    def extract_train(self, sentences, labels):
        ''' Extract feature vectors and numbered labels from training data.
        @param sentences: list of sentences to be extracted
        @param labels: literal labels of each sentence

        @return X: 2D numpy array, feature vectors, one sentence per row
        @return y: 1D numpy array, numbered label of each sentence
        '''
        literal_labels = list(set(labels))
        print "Labels: ", literal_labels
        y = np.array([literal_labels.index(l) for l in labels])

        sentences = [preprocess(s) for s in sentences]
        self.pre_calculate(sentences)

        Xs = []
        X = np.array([self._extract(s) for s in sentences])
        self.literal_labels = literal_labels
        return X, y
Code example #45
def get_features_from_nltk(tweet):

    # is tweet sarcastic
    is_sarcastic = int("#sarcasm" in tweet["text"])

    processed_tweet = preprocess(tweet["text"])
    processed_text = processed_tweet["text"]

    tokens = nltk.word_tokenize(processed_text)
    tokens = [(t.lower()) for t in tokens]

    mean_sentiment = sentiment_helper.score_sentence(tokens)
    positive_sentence_sentiment = mean_sentiment[0]
    negative_sentence_sentiment = mean_sentiment[1]
    sentence_sentiment = mean_sentiment[0] - mean_sentiment[1]

    word_sentiments = []
    for word in processed_text.split(" "):
        if len(word) > 0:
            word_sentiment = sentiment_helper.score_word(word.lower())
            word_sentiments.append(word_sentiment)

    maximum_word_polarity = max([x[0] for x in word_sentiments])
    minimum_word_polarity = max([x[1] for x in word_sentiments])

    polarity_distance_max = maximum_word_polarity - sentence_sentiment
    polarity_distance_min = abs(minimum_word_polarity - sentence_sentiment)

    blob_text = TextBlob(processed_text)
    topic_keywords = blob_text.noun_phrases + processed_tweet["hashtags"] + processed_tweet["mentions"]
    topic_positive, topic_negative, topic_sarcasm = get_topic_sentiment_nltk(topic_keywords)

    return ["{0:.2f}".format(positive_sentence_sentiment),
            "{0:.2f}".format(negative_sentence_sentiment),
            "{0:.2f}".format(sentence_sentiment),
            "{0:.2f}".format(maximum_word_polarity),
            "{0:.2f}".format(minimum_word_polarity),
            "{0:.2f}".format(polarity_distance_max),
            "{0:.2f}".format(polarity_distance_min),
            "{0:.2f}".format(topic_positive),
            "{0:.2f}".format(topic_negative),
            topic_sarcasm,
            is_sarcastic]
Code example #46
File: yasR.py Project: Niols/yasR
def main():
    try:
        print_title ()

        P = Params()
        P.load()
        P.check_all()

        files_to_rename = get_files_to_rename(P.INPUT_DIRS, P.VIDEO_EXTENSIONS)

        actions_to_process = preprocessor.preprocess(
            files      = files_to_rename,
            language   = P.LANGUAGE,
            output_dir = P.OUTPUT_DIR
        )

        processor.process(
            to_process  = actions_to_process,
            config_path = P.get_path(expanded=True),
            ACTION      = P.ACTION
        )

    except KeyboardInterrupt:
        print()
        log.info('Aborting.')
        exit(1)

    except ConnectionError:
        log.fail('Lost connection. Aborting.')
        exit(2)

    except ConnectionRefusedError:
        log.fail('Lost connection. Aborting.')
        exit(2)

    except KeyError as e:
        if e.args[0] == 'EDITOR':
            log.fail('Could not find the environment variable EDITOR. Aborting.')
            exit(1)
        else:
            log.fail('Uncaught KeyError exception: %s. Aborting.' % e.args[0])
            exit(2)
Code example #47
def get_features_from_text_blob(tweet):

    # is tweet sarcastic
    is_sarcastic = int("#sarcasm" in tweet["text"])

    # preprocess tweet content
    processed_tweet = preprocess(tweet["text"])
    processed_text = processed_tweet["text"]

    blob_text = TextBlob(processed_text)

    # measure sentiment features of tweet
    sentence_polarity = blob_text.sentiment.polarity
    sentence_subjectivity = blob_text.sentiment.subjectivity

    # calculate word based polarity to capture extreme expressions
    polarities = []
    for word in processed_text.split(" "):
        blob_word = TextBlob(word)
        polarities.append(blob_word.sentiment.polarity)

    maximum_word_polarity = max(polarities)
    minimum_word_polarity = min(polarities)

    # measure how extreme the most expressive is with respect to whole sentence
    polarity_distance_max = maximum_word_polarity - sentence_polarity
    polarity_distance_min = abs(minimum_word_polarity - sentence_polarity)

    # extract topic based sentiment values; combined polarity, subjectivity and any sarcasm clue
    topic_keywords = blob_text.noun_phrases + processed_tweet["hashtags"] + processed_tweet["mentions"]
    topic_polarity, topic_subjectivity, topic_sarcasm = get_topic_sentiment(topic_keywords)

    return ["{0:.2f}".format(sentence_polarity),
            "{0:.2f}".format(sentence_subjectivity),
            "{0:.2f}".format(maximum_word_polarity),
            "{0:.2f}".format(polarity_distance_max),
            "{0:.2f}".format(polarity_distance_min),
            "{0:.2f}".format(topic_polarity),
            "{0:.2f}".format(topic_subjectivity),
            topic_sarcasm,
            is_sarcastic]
Code example #48
File: __init__.py Project: chenxiaohui/BimCenter
        #print key.center(80,'*')+'\n'
        #print item.Serialize()+'\n'

    for key,value in items.items():
        fp.write(key.center(70,'*')+'\n')
        fp.write(value.Serialize()+'\n')

    
if __name__ == '__main__':
    log.InitLog()       
    px=Parser()

    with open('IFC2X3_TC1.exp','rb') as fp:
    #with open('schema.exp','rb') as fp:
        px.parse(fp)

    dataset=px.dataset
    preprocess(dataset)
    with open('IFC2X3_TC1.json','w') as fp:
    #with open('schema.json','w') as fp:
        toJson(dataset.types,fp)
        toJson(dataset.entities,fp)
        toJson(dataset.rules,fp)
        toJson(dataset.functions,fp)

    generater=Generator(dataset)
    generater.generateCommonFiles()
    generater.generateTypes()
    generater.generateEntities()
    generater.generateIndexes()
Code example #49
File: parser.py Project: fredreichbier/snowmanlang
def parse(s, parser=None):
    if parser is None:
        parser = Parser(file_prefix='.d_parser_mach_gen')
    return parser.parse(preprocessor.preprocess(s)).structure
Code example #50
		train_filename = sys.argv[i]
	i += 1

print >>sys.stderr, "reading labelled dataset from '" + train_filename + "'..."

input = open(train_filename, "r") if train_filename != "-" else sys.stdin

input.readline()

X = numpy.loadtxt(input, delimiter=",", dtype=numpy.uint8)

labels = X[:,0]
X=X[:,1:].astype(float)

print >>sys.stderr, "training KNN with", min(train_threshold, X.shape[0]), "training instances and k=", k, "..."
clf = make_classifier(preprocess(X[:train_threshold]), labels[:train_threshold], name="KNN", params=[k])

print >>sys.stderr, "making predicitions for", max(0,X.shape[0]-train_threshold), "instances ..."
predictions = clf.predict(preprocess(X[train_threshold:]))

print >>sys.stderr, "evaluating ..."

if verbose:
	for i in range(len(predictions)):
		print labels[train_threshold:][i], predictions[i]
		if labels[train_threshold:][i] != predictions[i]:
			print >>sys.stderr, "should be:", labels[train_threshold:][i], ", was:", predictions[i]
			put_image(X[train_threshold:][i], 0, sys.stderr)
			print >>sys.stderr
else:
	for i in range(len(predictions)):
Code example #51
File: test.py Project: codehunks/cloud
from preprocessor import preprocess


print preprocess("Sankararaman case: Kanchi seers, other accused acquitted".split(' '))
Code example #52
 def extract(self, sentence):
     '''Extract the feature vector for a testing sentence. The sentence is first turned into a list of words and then the feature extraction logic is delegated to _extract.'''
     return self._extract(preprocess(sentence))
Code example #53
File: main.py Project: zhangchaolts/Captcha
	dir_path_step = '../../pics/gujinsuo/pics_step'
	dir_path_train = '../../pics/gujinsuo/pics_train/'

	deal_number = 10

	pic_step1 = 1
	pic_step2 = 2
	pic_step3 = 3

	for pic_ptr in xrange(deal_number):

		pic_ptr_str = str('%04d' % pic_ptr)
		image_path = dir_path_base + pic_ptr_str + '.jpg'

		pic = Image.open(image_path)
		pic_preprocessed = preprocessor.preprocess(pic)

		output_path = dir_path_step + str(pic_step1) + '/' + pic_ptr_str + '_' + str(pic_step1) + '.jpg'
		print output_path
		pic_preprocessed.save(output_path)

		block_array = []
		spliter.split(pic_preprocessed, block_array)
		for i in xrange(len(block_array)):
			output_path = dir_path_step + str(pic_step2) + '/' + pic_ptr_str + '_' + str(pic_step2) + '_' + str(i) + '.jpg'
			print output_path
			block_array[i].save(output_path)

	for pic_ptr in xrange(deal_number):

		pic_ptr_str = str('%04d' % pic_ptr)
Code example #54
File: main.py Project: mmfrb/pln-projeto
  else:
    if sys.argv[1] == 'naivebayes' or sys.argv[1] == 'knn':
      annotated_texts = read('blog-gender-dataset.xlsx')

      training_set_len = 0.7 * len(annotated_texts)

      training_set = []
      test_set = []

      for (text,gender) in annotated_texts:
        if 'M' in gender:
          gender = 'M'
        else:
          gender = 'F'
        if len(training_set) < training_set_len:
          training_set.append((preprocess(text), gender))
        else:
          test_set.append((preprocess(text), gender))

      if sys.argv[1] == 'naivebayes':
        classifier = NaiveBayesClassifier(training_set)

      else:
        classifier = KNNClassifier(training_set, 5)

      print(calculate_metrics(test_set, classifier))

    else:
      print('Invalid classifier name. Choose from [naivebayes, knn]')
Code example #55
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import BaggingClassifier
from sklearn import tree
import numpy
from preprocessor import preprocess, getDataSet, getLabelMap, getAttributeMap
import json

train_file = "/Users/phx/downloads/competetion/recipe/train.json"
with open(train_file) as file:
    data = json.load(file)
print("size of dataset %d" % len(data))

data = preprocess(data)

train_data = [data[i] for i in xrange(0, len(data)) if i % 3 != 0]
test_data = [data[i] for i in xrange(0, len(data)) if i % 3 == 0]

# test_data= preprocess(test_data)

attribute_map = getAttributeMap(train_data, 1)

print("attribute number : %d" % len(attribute_map))
print(attribute_map)


label_map = getLabelMap(data)
print("label number : %d" % len(label_map))
print(label_map)
Code example #56
__author__ = 'phx'

import numpy
from sklearn import metrics
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from preprocessor import preprocess,getAttributeMap,getLabelMap,getDataSet
import json
train_file ="train.json"
with open(train_file) as file:
     data = json.load(file)
print(len(data))

data = preprocess(data)
"""
train_data = [data[i] for i in xrange(0,len(data)) if i%4 !=0]
test_data = [data[i] for i in xrange(0,len(data)) if i%4 ==0]
"""
train_data = data


test_file ="/Users/phx/downloads/competetion/recipe/test.json"
with open(test_file) as file:
     test_data = json.load(file)
print(len(test_data))
test_data= preprocess(test_data)


attribute_map = getAttributeMap(train_data,1)
Code example #57
            print "filename: ", xml_filename

            text_filepath = outputloc + xml_filename + ".txt"

            pt = ParseText(xml_filepath, text_filepath)
            content = pt.readXmlToString()

            #content_list variable is not used anymore, but is still used in the getBio code that I left so I left this here too
            #content_list = pt.readXMLToList()
            if content in xmlset: continue # Skip duplicates
            xmlset.add(content)

            soup = BeautifulSoup(content, "html.parser")

            #Preprocess using preprocessor.py
            preprocess(soup)

            #Get headings
            headings = pt.findHeadings(PROBABLE_HEADINGS, soup)
            headingsclean = [h.get_text() for h in headings]
            
            #bio = pt.find_bio(content, content_list, headings, heading_indexes)


            #Use find_this function to find edu, exp, leadership, skills, languages, volunteer
            edu, isXml = pt.find_this(soup, ["education", "educaton"], [])
            exp, isXml = pt.find_this(soup, ["experience", "employment", 'career', 'history', 'professional', 'work'], ['objective', 'course'])
            leadExp, x = pt.find_this(soup, ["leadership", 'community', 'extracurricular', 'activities', 'organizations'], [])
            skills, isXml = pt.find_this(soup, ["kills"], [])
            languages, isXml = pt.find_this(soup, ["languages", 'foreign'], ['computer', 'programming'])