def extraction(cleandf):
    # Load cached n-gram models if available; otherwise build them and persist to disk.
    try:
        (unigramSrc, bigramSrc, trigramSrc,
         unigramTgt, bigramTgt, trigramTgt,
         unigramSrcPos, bigramSrcPos, trigramSrcPos,
         unigramTgtPos, bigramTgtPos, trigramTgtPos) = corpf.loadNLP()
    except Exception:
        (unigramSrc, bigramSrc, trigramSrc,
         unigramTgt, bigramTgt, trigramTgt,
         unigramSrcPos, bigramSrcPos, trigramSrcPos,
         unigramTgtPos, bigramTgtPos, trigramTgtPos) = corpf.getNgramModels()
        lc.savemodel(unigramSrc, '../data/unigramSrc.joblib')
        lc.savemodel(bigramSrc, '../data/bigramSrc.joblib')
        lc.savemodel(trigramSrc, '../data/trigramSrc.joblib')
        lc.savemodel(unigramTgt, '../data/unigramTgt.joblib')
        lc.savemodel(bigramTgt, '../data/bigramTgt.joblib')
        lc.savemodel(trigramTgt, '../data/trigramTgt.joblib')
        lc.savemodel(unigramSrcPos, '../data/unigramSrcPos.joblib')
        lc.savemodel(bigramSrcPos, '../data/bigramSrcPos.joblib')
        lc.savemodel(trigramSrcPos, '../data/trigramSrcPos.joblib')
        lc.savemodel(unigramTgtPos, '../data/unigramTgtPos.joblib')
        lc.savemodel(bigramTgtPos, '../data/bigramTgtPos.joblib')
        lc.savemodel(trigramTgtPos, '../data/trigramTgtPos.joblib')
    dfnew = ex.extractor(cleandf,
                         unigramSrc, bigramSrc, trigramSrc,
                         unigramTgt, bigramTgt, trigramTgt,
                         unigramSrcPos, bigramSrcPos, trigramSrcPos,
                         unigramTgtPos, bigramTgtPos, trigramTgtPos)
    dfnew.to_csv('../data/features.csv', encoding='utf-8', index=False)
    return dfnew
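# A minimal sketch (not part of the snippet above) of the same cache-or-rebuild
# pattern using joblib directly; corpf.loadNLP and lc.savemodel are project-specific
# helpers, so the path and build_fn arguments here are illustrative assumptions.
import os
import joblib

def load_or_build(path, build_fn):
    # Return the cached object at `path`, rebuilding and persisting it if missing.
    if os.path.exists(path):
        return joblib.load(path)
    obj = build_fn()
    joblib.dump(obj, path)
    return obj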
def do_analysis():
    data = extractor(request.form['url'])
    if data:
        r = requests.post("http://localhost:8080/fakebox/check", data={
            "url": data['url'],
            "title": data['title'],
            "content": data['content']
        })
        j = json.loads(r.text)
        conn = sqlite3.connect('db.sqlite')
        c = conn.cursor()
        c.execute("SELECT * FROM politicians")
        politicians = c.fetchall()
        contains = []
        for politician in politicians:
            if data['content'].count(politician[1]) > 0 or data['title'].count(politician[1]) > 0:
                contains.append(politician[1])
        return render_template('analyse_result.html',
                               url=request.form['url'],
                               title=data['title'],
                               dtitle=j['title']["decision"],
                               dcontent=j['content']["decision"],
                               contains=contains)
    return 'ERROR : URL Not Supported!'
def extract_all():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    title_keywords = {}
    abs_keywords = {}
    ext = extractor.extractor(get_db('wikipedia'))
    cur = get_db('arnet_db')
    cur.execute("select id, title from publication")
    cnt, tot = 0, cur.rowcount
    for id, title in cur.fetchall():
        if cnt % 100 == 0:
            logging.info("loading %d/%d" % (cnt, tot))
        cnt += 1
        keywords = ext.extract_str(title)
        if len(keywords) > 0:
            title_keywords[id] = keywords
        cur.execute("select abstract from publication_ext where id = %s", id)
        abs = cur.fetchone()
        if abs is not None:
            abs = abs[0]
        if abs is not None:
            keywords = ext.extract_str(abs)
            if len(keywords) > 0:
                abs_keywords[id] = keywords
    logging.info('dumping title_keywords')
    cPickle.dump(title_keywords, open('title_keywords.dump', 'wb'))
    logging.info('dumping abs_keywords')
    cPickle.dump(abs_keywords, open('abs_keywords.dump', 'wb'))
def extract_all(bulk_info=(80000000, 0)):
    bulk_size, bulk_no = bulk_info
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # ext = extractor.extractor(get_db('wikipedia'))
    ext = extractor.extractor('wiki_dump.txt', db=False)
    mongodb = get_mongodb()
    pubs = mongodb.publication_dupl
    word_colls = mongodb.keywords
    cnt, tot = 0, pubs.count()
    for doc in pubs.find(skip=bulk_size * bulk_no, limit=bulk_size):
        if cnt % 100 == 0 and bulk_no == 0:
            # logging.info("loading title %d/%d" % (cnt, tot))
            logging.info("loading abstract %d/%d" % (cnt, tot))
        cnt += 1
        if 'lang' in doc and doc['lang'] == 'zh':
            continue
        # title = doc['title'] if 'title' in doc else ''
        abs = doc['abstract'] if 'abstract' in doc else ''
        # title_keywords = ext.extract_str(title)
        abstract_keywords = ext.extract_str(abs)
        # word_colls.update_one({'_id': doc['_id']}, {'$set': {'title_keywords': title_keywords}}, upsert=True)
        word_colls.update_one({'_id': doc['_id']}, {'$set': {'abstract_keywords': abstract_keywords}}, upsert=True)
def main():
    print 'Preparing materials...'
    delete_folder_with_files(middle_data)
    os.mkdir(middle_data)
    delete_folder_with_files(extractor_data)
    os.mkdir(extractor_data)

    # before the add-on steps, deal with emoticons
    print 'Finding and converting emoticons...'
    features.emoticon_detection(marked_data, middle_data)

    # mandatory step without removal of repeated letters: run through mystem + strip punctuation
    #print 'Running through mystem and stripping punctuation...'
    #features.mystem_using(marked_data, middle_data)

    # mandatory step with removal of repeated letters: run through mystem + strip punctuation
    print 'Running through mystem (removing consecutive repeated letters) and stripping punctuation...'
    features.mystem_using_with_considering_of_multiple_letters(middle_data, middle_data)

    # removes repeated letters; a feature, not an add-on step
    print 'Removing consecutive repeated letters...'
    features.considering_of_multiple_letters(middle_data)

    # removal of prepositions
    #print 'Removing prepositions...'
    #features.without_prepositions(middle_data)

    # removal of conjunctions
    #print 'Removing conjunctions...'
    #features.without_conjunctions(middle_data)

    # removal of pronouns
    #print 'Removing pronouns...'
    #features.without_pronouns(middle_data)

    # the negative particle
    print 'Attaching the particle "not" to non-state words...'
    features.with_not(middle_data)

    # remove English-language terms
    print 'Removing English-language terms...'
    features.without_foreign_words(middle_data)

    # 1, 2, ..., n-grams
    #print '1, 2, 3, ... n-grams...'
    #features.more_than_n_gram_feature(2, middle_data)

    # n-grams
    #print 'n-grams...'
    #features.n_gram_feature(3, middle_data)

    print 'Extractor is running...'
    my_extractor = extractor.extractor(middle_data, extractor_data)
    if my_extractor.extract() == False:
        raise Exception('Error in extractor!')

    print 'Starting machine learning...'
    executeMlAndPrintAccurancy(naive_bayes_gaussian_count.NaiveBayesGaussian(extractor_data))
    executeMlAndPrintAccurancy(naive_bayes_multinomial_count.NaiveBayesMultinomial(extractor_data))
def detection(test_dir):
    resultsDir, logDir = generate_path(test_dir)
    params = parameters()
    ex = extractor(params)
    X, original_dataset = ex.generate_feature(test_dir, 'test')
    classifier = attack_detection(params.model_dir)
    print('Start evaluation...')
    classifier.fit_predict(X)
    classifier.filter_attckSample(original_dataset, resultsDir)
    classifier.reset()
    print('Evaluation finished...')
    print('Results saved in: ' + resultsDir)
    print('Logs saved in: ' + logDir)
def work(device, queue_in, queue_out):
    config = tf.ConfigProto()
    config.allow_soft_placement = True
    config.gpu_options.allow_growth = True
    session = tf.Session(config=config)
    extractor_ = extractor(session, [device], 1)
    aligner_ = aligner(session, [device], 1)
    for image in queue_in:
        image = np.stack([image], axis=0)
        image = aligner_.align(image)
        features = extractor_.extract(image)
        #print(len(features))
        for feature in features:
            queue_out.append(feature)
def extract():
    global option
    global uploaded_files
    if len(uploaded_files) < 1:
        return "Choose some files first"
    if option is None:
        return "Pick the option first"
    informations = []
    # `news` and `keyword` are expected to be module-level globals populated elsewhere.
    for sentences in news:
        informations += extractor(keyword, sentences, option)
    return render_template("index.html", informations=informations, query=keyword)
def traductor(path_to_pdf):
    lop = extractor.extractor(path_to_pdf)[1]
    idx = 0
    str_var = ''
    for p in lop:
        #print(p)
        fails = 0
        vc = True
        while vc:
            try:
                pt = translator.translator(p)
                print(pt)
                if pt is not None:
                    str_var += pt + '\n'
                vc = False
            except ConnectError as e:
                fails += 1  # count the failed attempt so the retry loop terminates
                if fails < 3:
                    time.sleep(5)
                    print('sleeping a moment')
                else:
                    vc = False
                    str_var += '%s not possible to translate' % idx
        idx += 1
        time.sleep(1)
    return str_var
def handle(request):
    name = request.match_info['file_name']
    html_response = extractor.extractor(name)
    return web.Response(text=html_response, content_type='text/html')
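# A minimal sketch (not part of the snippet above) of wiring handle() into an
# aiohttp application; the '/pages/{file_name}' route path and the port are assumptions,
# and depending on the aiohttp version handle() may need to be declared async def.
from aiohttp import web

app = web.Application()
# The {file_name} placeholder is what feeds request.match_info['file_name'] in handle().
app.add_routes([web.get('/pages/{file_name}', handle)])

if __name__ == '__main__':
    web.run_app(app, port=8080)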
home_dir = sys.argv[1]
write_dir = sys.argv[2]
index = np.float(sys.argv[3])
proj = sys.argv[4]

os.chdir(home_dir + 'code')
from extractor import gdoc_query, extractor
from cluster_utilities import *

### take argument and read in template reader
df_reader = pd.read_csv(write_dir + 'inpatient_template_reader.csv')

## hard code path
data_path = write_dir

## loop thru everything.
func = extractor()
func.format_survey_info_1(df_reader.iloc[np.int(index)].copy().to_dict())
func.check_number_of_responses_2()

iso = func.reader['ISO3'][0]
visit_type = func.reader['type'][0]
title = func.reader['title'][0]
nid = func.reader['nid'][0]

func.read_in_data_3()
print 'done with data'

filename = iso + '_' + nid + '_' + title + '_' + visit_type + '_' + 'raw_data.p'
filename = filename.lower()
def dosync(self):
    print("Email incoming...")
    with MailBox(IMAP_SERVER).login(EMAIL, PASSWORD, "INBOX") as mailbox:
        email = mailbox.fetch(limit=1, reverse=True)
        print("Email retrieved!")
        extractor(next(email))
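# A defensive variant (sketch, not part of the snippet above): fetch() returns a
# lazy generator, so next() raises StopIteration on an empty INBOX; passing a
# default avoids that. The server/credential arguments are assumptions standing
# in for the IMAP_SERVER, EMAIL, and PASSWORD constants used above.
from imap_tools import MailBox

def fetch_latest(server, user, password):
    with MailBox(server).login(user, password, "INBOX") as mailbox:
        return next(mailbox.fetch(limit=1, reverse=True), None)  # None if the folder is empty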
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from extractor import extractor
from normalizer import normalizer
from converter import converter

if __name__ == "__main__":
    results = extractor()
    nodes, ties = normalizer(results)
    converter(nodes, ties)
# -*- coding: utf-8 -*-
from extractor import extractor

ext = extractor()

# loading some sigs
ext.load_signature('sigs\\A_2.png', {'name': '1st'})
ext.load_signature('sigs\\B_3.png', {'name': 'second'})
ext.load_signature('sigs\\genuine-10.png', {'name': 'third'})
ext.load_signature('sigs\\genuine-12.png', {'name': 'fourth'})

ext.prepare('pdfs\\5_Scan_18022019_192748.pdf')

# this gives me the matches for all the four above signatures, based on the document
payload = ext.extract()
class mediumClass:
    jsonHandler = handler_json()
    csvHandler = handler_csv()
    downloadHandler = extractor()

    def __init__(self):
        debug.debug_print("Medium Class is up", 1)

    def generate_allSocialDistancingData(self):
        statesData = self.csvHandler._loadData('states.csv')[0]
        for state in statesData:
            fips = int(state['state_fips'], 10)
            self.downloadHandler.get_socialDistancingData(fips, 'temp.json')
            # First step, create socialDistancing.csv file
            if state == statesData[0]:
                self.jsonHandler.transform_jsonToCsv_socialDistancingData('temp.json', 'socialDistancing.csv')
            # Other steps, merge new data into socialDistancing.csv file
            else:
                self.jsonHandler.transform_jsonToCsv_socialDistancingData('temp.json', 'temp.csv')
                self.csvHandler.merge_csvFiles_addRows('socialDistancing.csv', 'temp.csv', 'socialDistancing.csv')

    # This function removes useless stations from the csv file; "useless" means
    # stations whose max-date is earlier than 2020-01-22.
    def clean_stations(self):
        stationsData = []
        fieldnames = []
        with open(_CSV_Directory_ + 'temp-stations.csv') as csvFile:
            csvDriver = csv.DictReader(csvFile)
            fieldnames = csvDriver.fieldnames
            for row in csvDriver:
                stationsData.append(row)
        with open(_CSV_Directory_ + 'new_stations.csv', 'w') as csvFile:
            csvDriver = csv.DictWriter(csvFile, fieldnames)
            csvDriver.writeheader()
            startDay = date.fromisoformat('2020-01-22')
            for station in stationsData:
                try:
                    if date.fromisoformat(station['maxdate']) > startDay:
                        csvDriver.writerow(station)
                except Exception:
                    continue
        debug.debug_print("SUCCESS: useless stations removed", 2)

    def generate_allWeatherData(self, startDate, endDate):
        stationsData = self.csvHandler._loadData('temp-stations.csv')[0]
        numberOfStations = len(stationsData)
        # progressBarWidget = [progressbar.Percentage(),
        #                      ' ',
        #                      progressbar.Bar('#', '|', '|'),
        #                      ' ',
        #                      progressbar.Variable('FIPS', width=12, precision=12),
        #                      ' ',
        #                      progressbar.Variable('ID', width=12, precision=12),
        #                      ]
        # progressBar = progressbar.ProgressBar(maxval=numberOfStations, widgets=progressBarWidget, redirect_stdout=True)
        # progressBar.start()
        step = 0
        try:
            logFile = open('weather.log', 'r')
            step = int(logFile.read(), 10)
            logFile.close()
        except Exception:
            logFile = open('weather.log', 'w')
            logFile.write(str(step))
            logFile.close()
        for i in range(step, numberOfStations):
            with open('weather.log', 'w') as logFile:
                logFile.write(str(i))
            stationID = stationsData[i]['id'].split(':')[1]
            countyFips = stationsData[i]['county_fips']
            # progressBar.update(i, FIPS=countyFips, ID=stationID)
            # First step, create weather.csv file
            if i == 0:
                self.downloadHandler.get_countyWeatherData(countyFips, stationID, startDate, endDate, 'new-weather.csv')
            # Other steps, merge new data into weather.csv file
            else:
                self.downloadHandler.get_countyWeatherData(countyFips, stationID, startDate, endDate, 'temp.csv')
                self.csvHandler.merge_csvFiles_addRows('new-weather.csv', 'temp.csv', 'new-weather.csv')
        # progressBar.finish()
        debug.debug_print("SUCCESS: data extracted (weather data)", 2)
def extract_body(self, document):
    the_extractor = extractor()
    return the_extractor.extract(document)
import random

parser = argparse.ArgumentParser()
parser.add_argument('--devices', default='/gpu:0')
parser.add_argument('--extractor_batch_size', default=256, type=int)
parser.add_argument('--aligner_batch_size', default=64, type=int)
args = parser.parse_args()
args.devices = args.devices.split(',')

config = tf.ConfigProto()
config.allow_soft_placement = False
config.gpu_options.allow_growth = True
session = tf.Session(config=config)

aligner = aligner(session, args.devices, args.aligner_batch_size)
extractor = extractor(session, args.devices, args.extractor_batch_size)


def batch_process(f, x, s):
    # Apply f to x in batches of size s, padding the last batch with copies of
    # its first element and dropping the padded outputs afterwards.
    results = []
    for i in range(0, len(x), s):
        x_ = x[i:i + s]
        if len(x_) != s:
            x_ += [x_[0]] * (s - len(x_))
        y_ = f(x_)
        for j in y_:
            if len(results) < len(x):
                results.append(j)
        print(len(results), 'done')
    return results
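# Toy usage of batch_process above (a sketch; the square function is a stand-in
# for extractor.extract / aligner.align): with 10 inputs and batch size 4, the
# last batch is padded and the extra outputs dropped, so exactly 10 results return.
def square_batch(batch):
    return [v * v for v in batch]

print(batch_process(square_batch, list(range(10)), 4))
# [0, 1, 4, 9, 16, 25, 36, 49, 64, 81], with a progress line printed per batch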
import extractor

print(extractor.extractor("exemplo.jpeg"))