Example no. 1
def extraction(cleandf):
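    # reuse previously saved n-gram models when available; otherwise rebuild and persist them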
    try:
        (unigramSrc, bigramSrc, trigramSrc, unigramTgt, bigramTgt, trigramTgt,
         unigramSrcPos, bigramSrcPos, trigramSrcPos, unigramTgtPos,
         bigramTgtPos, trigramTgtPos) = corpf.loadNLP()
    except Exception:
        (unigramSrc, bigramSrc, trigramSrc, unigramTgt, bigramTgt, trigramTgt,
         unigramSrcPos, bigramSrcPos, trigramSrcPos, unigramTgtPos,
         bigramTgtPos, trigramTgtPos) = corpf.getNgramModels()
        lc.savemodel(unigramSrc, '../data/unigramSrc.joblib')
        lc.savemodel(bigramSrc, '../data/bigramSrc.joblib')
        lc.savemodel(trigramSrc, '../data/trigramSrc.joblib')
        lc.savemodel(unigramTgt, '../data/unigramTgt.joblib')
        lc.savemodel(bigramTgt, '../data/bigramTgt.joblib')
        lc.savemodel(trigramTgt, '../data/trigramTgt.joblib')
        lc.savemodel(unigramSrcPos, '../data/unigramSrcPos.joblib')
        lc.savemodel(bigramSrcPos, '../data/bigramSrcPos.joblib')
        lc.savemodel(trigramSrcPos, '../data/trigramSrcPos.joblib')
        lc.savemodel(unigramTgtPos, '../data/unigramTgtPos.joblib')
        lc.savemodel(bigramTgtPos, '../data/bigramTgtPos.joblib')
        lc.savemodel(trigramTgtPos, '../data/trigramTgtPos.joblib')
    dfnew = ex.extractor(cleandf, unigramSrc, bigramSrc, trigramSrc,
                         unigramTgt, bigramTgt, trigramTgt, unigramSrcPos,
                         bigramSrcPos, trigramSrcPos, unigramTgtPos,
                         bigramTgtPos, trigramTgtPos)
    dfnew.to_csv('../data/features.csv', encoding='utf-8', index=False)
    return dfnew
Example no. 2
def do_analysis():
    data = extractor(request.form['url'])
    if data:
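        # send the scraped title and content to the local Fakebox instance for scoring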
        r = requests.post("http://localhost:8080/fakebox/check",
                          data={
                              "url": data['url'],
                              "title": data['title'],
                              "content": data['content']
                          })
        j = json.loads(r.text)

        conn = sqlite3.connect('db.sqlite')
        c = conn.cursor()
        c.execute("SELECT * FROM politicians")
        politicians = c.fetchall()
        conn.close()

        contains = []

        for politician in politicians:
            if politician[1] in data['content'] or politician[1] in data['title']:
                contains.append(politician[1])

        return render_template('analyse_result.html',
                               url=request.form['url'],
                               title=data['title'],
                               dtitle=j['title']["decision"],
                               dcontent=j['content']["decision"],
                               contains=contains)
    return 'ERROR : URL Not Supported!'
Example no. 3
def extract_all():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    title_keywords = {}
    abs_keywords = {}
    ext = extractor.extractor(get_db('wikipedia'))

    cur = get_db('arnet_db')
    cur.execute("select id, title from publication")
    cnt, tot = 0, cur.rowcount
    for id, title in cur.fetchall():
        if cnt % 100 == 0:
            logging.info("loading %d/%d" % (cnt, tot))
        cnt += 1

        keywords = ext.extract_str(title)
        if keywords:
            title_keywords[id] = keywords

        # DB-API style: pass query parameters as a tuple
        cur.execute("select abstract from publication_ext where id = %s", (id,))
        row = cur.fetchone()
        if row is not None and row[0] is not None:
            keywords = ext.extract_str(row[0])
            if keywords:
                abs_keywords[id] = keywords

    logging.info('dumping title_keywords')
    with open('title_keywords.dump', 'wb') as f:
        cPickle.dump(title_keywords, f)

    logging.info('dumping abs_keywords')
    with open('abs_keywords.dump', 'wb') as f:
        cPickle.dump(abs_keywords, f)
Example no. 4
def extract_all(bulk_info = (80000000, 0)):
    bulk_size, bulk_no = bulk_info
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # ext = extractor.extractor(get_db('wikipedia'))
    ext = extractor.extractor('wiki_dump.txt', db = False)

    mongodb = get_mongodb()
    pubs = mongodb.publication_dupl
    word_colls = mongodb.keywords
    cnt, tot = 0, pubs.count()
    for doc in pubs.find(skip = bulk_size * bulk_no, limit = bulk_size):
        if cnt % 100 == 0 and bulk_no == 0:
            # logging.info("loading title %d/%d" % (cnt, tot))
            logging.info("loading abstract %d/%d" % (cnt, tot))
        cnt += 1

        # skip Chinese-language publications
        if doc.get('lang') == 'zh':
            continue

        # title = doc.get('title', '')
        abstract = doc.get('abstract', '')

        # title_keywords = ext.extract_str(title)
        abstract_keywords = ext.extract_str(abstract)

        # word_colls.update_one({'_id': doc['_id']}, {'$set': {'title_keywords': title_keywords}}, upsert = True)
        word_colls.update_one({'_id': doc['_id']}, {'$set': {'abstract_keywords': abstract_keywords}}, upsert = True)
Example no. 5
def main():
    print('Preparing materials...')
    delete_folder_with_files(middle_data)
    os.mkdir(middle_data)
    delete_folder_with_files(extractor_data)
    os.mkdir(extractor_data)

    # handle emoticons before any other preprocessing step
    print('Detecting and converting emoticons...')
    features.emoticon_detection(marked_data, middle_data)

    # mandatory step without removal of repeated letters: run through mystem + strip punctuation
    #print('Running through mystem and stripping punctuation...')
    #features.mystem_using(marked_data, middle_data)

    # mandatory step with removal of repeated letters: run through mystem + strip punctuation
    print('Running through mystem (collapsing consecutive repeated letters) and stripping punctuation...')
    features.mystem_using_with_considering_of_multiple_letters(middle_data, middle_data)

    # collapses repeated letters; a feature, not a preprocessing step
    print('Collapsing consecutive repeated letters...')
    features.considering_of_multiple_letters(middle_data)

    # removal of prepositions
    #print('Removing prepositions...')
    #features.without_prepositions(middle_data)

    # removal of conjunctions
    #print('Removing conjunctions...')
    #features.without_conjunctions(middle_data)

    # removal of pronouns
    #print('Removing pronouns...')
    #features.without_pronouns(middle_data)

    # the negation particle
    print('Attaching the negation particle to words that are not states...')
    features.with_not(middle_data)

    # remove English-language terms
    print('Removing English-language terms...')
    features.without_foreign_words(middle_data)

    # 1, 2, ..., n-grams
    #print('1, 2, 3, ..., n-grams...')
    #features.more_than_n_gram_feature(2, middle_data)

    # n-grams
    #print('n-grams...')
    #features.n_gram_feature(3, middle_data)

    print('Running the extractor...')
    my_extractor = extractor.extractor(middle_data, extractor_data)
    if not my_extractor.extract():
        raise Exception('Error in extractor!')

    print('Starting machine learning...')
    executeMlAndPrintAccurancy(naive_bayes_gaussian_count.NaiveBayesGaussian(extractor_data))
    executeMlAndPrintAccurancy(naive_bayes_multinomial_count.NaiveBayesMultinomial(extractor_data))
Example no. 6
def detection(test_dir):
    resultsDir, logDir = generate_path(test_dir)
    params = parameters()
    ex = extractor(params)
    X, original_dataset = ex.generate_feature(test_dir, 'test')
    classifier = attack_detection(params.model_dir)
    print('Start evaluation...')
    classifier.fit_predict(X)
    classifier.filter_attckSample(original_dataset, resultsDir)
    classifier.reset()
    print('Evaluation finished.')
    print('Results saved in: ' + resultsDir)
    print('Logs saved in: ' + logDir)
Example no. 7
def work(device, queue_in, queue_out):

    config = tf.ConfigProto()
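    # soft placement lets ops without a GPU kernel fall back to CPU;
    # allow_growth stops TF from reserving all GPU memory up front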
    config.allow_soft_placement = True
    config.gpu_options.allow_growth = True
    session = tf.Session(config=config)

    extractor_ = extractor(session, [device], 1)
    aligner_ = aligner(session, [device], 1)

    for image in queue_in:
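        # batch of one: align the image, then extract its feature vector(s)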
        image = np.stack([image], axis=0)
        image = aligner_.align(image)
        features = extractor_.extract(image)
        #print(len(features))
        for feature in features:
            queue_out.append(feature)
Example no. 8
def extract():

    global option
    global uploaded_files

    if len(uploaded_files) < 1:
        return "Choose some files first"

    if option is None:
        return "Pick the option first"

    informations = []

    for sentences in news:
        informations += extractor(keyword, sentences, option)

    return render_template("index.html",
                           informations=informations,
                           query=keyword)
Example no. 9
def traductor(path_to_pdf):

    lop = extractor.extractor(path_to_pdf)[1]

    idx = 0
    str_var = ''

    for p in lop:

        #print(p)

        fails = 0
        vc = True

        while vc:
            try:

                pt = translator.translator(p)
                print(pt)

                if pt is not None:

                    str_var += pt + '\n'
                    vc = False

            except ConnectError:

                # count the failure so the retry loop eventually gives up
                fails += 1

                if fails < 3:

                    time.sleep(5)
                    print('sleeping a moment')

                else:

                    vc = False
                    str_var += '%s not possible to translate' % idx
        idx += 1
        time.sleep(1)

    return str_var
Example no. 10
def handle(request):
    name = request.match_info['file_name']
    html_response = extractor.extractor(name)
    return web.Response(text=html_response, content_type='text/html')
Example no. 11
home_dir = sys.argv[1]
write_dir = sys.argv[2]
index = float(sys.argv[3])
proj = sys.argv[4]
os.chdir(home_dir + 'code')
from extractor import gdoc_query, extractor
from cluster_utilities import *

### take argument and read in the template reader
df_reader = pd.read_csv(write_dir + 'inpatient_template_reader.csv')

## hard-coded path
data_path = write_dir

## loop thru everything.
func = extractor()
func.format_survey_info_1(df_reader.iloc[int(index)].copy().to_dict())

func.check_number_of_responses_2()

iso = func.reader['ISO3'][0]
visit_type = func.reader['type'][0]
title = func.reader['title'][0]
nid = func.reader['nid'][0]
func.read_in_data_3()

print('done with data')

filename = iso + '_' + nid + '_' + title + '_' + visit_type + '_' + 'raw_data.p'
filename = filename.lower()
Example no. 12
    def dosync(self):
        print("Email incoming...")
        with MailBox(IMAP_SERVER).login(EMAIL, PASSWORD, "INBOX") as mailbox:
            # fetch the most recent message and hand it to the extractor
            emails = mailbox.fetch(limit=1, reverse=True)
            print("Email retrieved!")
            extractor(next(emails))
Example no. 13
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from extractor import extractor
from normalizer import normalizer
from converter import converter

if __name__ == "__main__":

    results = extractor()
    nodes, ties = normalizer(results)
    converter(nodes, ties)
Example no. 15
# -*- coding: utf-8 -*-

from extractor import extractor

ext = extractor()

# loading some sigs
ext.load_signature('sigs\\A_2.png', {'name': '1st'})
ext.load_signature('sigs\\B_3.png', {'name': 'second'})
ext.load_signature('sigs\\genuine-10.png', {'name': 'third'})
ext.load_signature('sigs\\genuine-12.png', {'name': 'fourth'})

ext.prepare('pdfs\\5_Scan_18022019_192748.pdf')
# this returns the matches for the four signatures loaded above, based on the document
payload = ext.extract()
Example no. 16
class mediumClass:
    jsonHandler = handler_json()
    csvHandler = handler_csv()
    downloadHandler = extractor()
    def __init__(self):
        
        debug.debug_print("Medium Class is up", 1)

    def generate_allSocialDistancingData(self):
        statesData = self.csvHandler._loadData('states.csv')[0]
        for state in statesData:
            fips = int(state['state_fips'], 10)
            self.downloadHandler.get_socialDistancingData(fips, 'temp.json')
            # First step, create socialDistancing.csv file
            if state == statesData[0]:
                self.jsonHandler.transform_jsonToCsv_socialDistancingData('temp.json', 'socialDistancing.csv')
            # Other steps, merge new data to socialDistancing.csv file
            else:
                self.jsonHandler.transform_jsonToCsv_socialDistancingData('temp.json', 'temp.csv')
                self.csvHandler.merge_csvFiles_addRows('socialDistancing.csv', 'temp.csv', 'socialDistancing.csv')

    # This function removes useless stations from the csv file; a station is useless if its max date is earlier than 2020-01-22
    def clean_stations(self):
        stationsData = []
        fieldnames = []
        with open(_CSV_Directory_ + 'temp-stations.csv') as csvFile:
            csvDriver = csv.DictReader(csvFile)
            fieldnames = csvDriver.fieldnames
            for row in csvDriver:
                stationsData.append(row)

        with open(_CSV_Directory_ + 'new_stations.csv', 'w') as csvFile:
            csvDriver = csv.DictWriter(csvFile, fieldnames)
            csvDriver.writeheader()
            startDay = date.fromisoformat('2020-01-22')
            for station in stationsData:
                try:
                    if date.fromisoformat(station['maxdate']) > startDay:
                        csvDriver.writerow(station)
                except (KeyError, ValueError):
                    # skip stations with a missing or malformed maxdate
                    continue

        debug.debug_print("SUCCESS: useless stations removed", 2)

    def generate_allWeatherData(self, startDate, endDate):
        stationsData = self.csvHandler._loadData('temp-stations.csv')[0]

        numberOfStations = len(stationsData)
        # progressBarWidget = [progressbar.Percentage(),
        # ' ',
        # progressbar.Bar('#', '|', '|'),
        # ' ',
        # progressbar.Variable('FIPS', width=12, precision=12),
        # ' ',
        # progressbar.Variable('ID', width=12, precision=12),
        # ]
        # progressBar = progressbar.ProgressBar(maxval=numberOfStations, widgets=progressBarWidget, redirect_stdout=True)
        # progressBar.start()

        step = 0
        try:
            # resume from the last station index recorded in the log file
            with open('weather.log', 'r') as logFile:
                step = int(logFile.read(), 10)
        except (OSError, ValueError):
            with open('weather.log', 'w') as logFile:
                logFile.write(str(step))
        
        for i in range(step, numberOfStations):
            with open('weather.log', 'w') as logFile:
                logFile.write(str(i))

            stationID = stationsData[i]['id'].split(':')[1]
            countyFips = stationsData[i]['county_fips']
            # progressBar.update(i, FIPS=countyFips, ID=stationID)
            # First step, create weather.csv file
            if i == 0:
                self.downloadHandler.get_countyWeatherData(countyFips, stationID, startDate, endDate, 'new-weather.csv')
            # Other steps, merge new data to weather.csv file
            else:
                self.downloadHandler.get_countyWeatherData(countyFips, stationID, startDate, endDate, 'temp.csv')
                self.csvHandler.merge_csvFiles_addRows('new-weather.csv', 'temp.csv', 'new-weather.csv')

        # progressBar.finish()
        debug.debug_print("SUCCESS: data extracted (weather data)", 2)
Example no. 17
    def extract_body(self, document):

        the_extractor = extractor()
        return the_extractor.extract(document)
Example no. 18
import random

parser = argparse.ArgumentParser()
parser.add_argument('--devices', default='/gpu:0')
parser.add_argument('--extractor_batch_size', default=256, type=int)
parser.add_argument('--aligner_batch_size', default=64, type=int)
args = parser.parse_args()
args.devices = args.devices.split(',')

config = tf.ConfigProto()
config.allow_soft_placement = False
config.gpu_options.allow_growth = True
session = tf.Session(config=config)

aligner = aligner(session, args.devices, args.aligner_batch_size)
extractor = extractor(session, args.devices, args.extractor_batch_size)


def batch_process(f, x, s):
    results = []
    for i in range(0, len(x), s):
        x_ = x[i:i + s]
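        # pad a short final batch to size s by repeating its first element;
        # the surplus outputs are dropped below so results stays aligned with x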
        if len(x_) != s:
            x_ += [x_[0]] * (s - len(x_))
        y_ = f(x_)
        for j in y_:
            if len(results) < len(x):
                results.append(j)
        print(len(results), 'done')
    return results
Example no. 19
import extractor

print(extractor.extractor("exemplo.jpeg"))