def __init__(self, df_data):
    self.df_data = df_data
    self.network = None
    self.network_adjusted = None
    self.df_types = None
    self.data_df = None
    self.d_id_title = None
    self.d_title_id = None
    self.views_df = None
    self.extract_data = ExtractData()
    self.logger = logging.getLogger(__name__)
def execute(self):
    # Requires: import os; from os.path import splitext
    all_files = [splitext(f)[0] for f in os.listdir(self.INPUT_FOLDER_PATH)]
    all_files.sort()
    print(all_files)
    for filename in all_files:
        if filename != '.DS_Store':  # skip macOS metadata files
            PDFtoText(filename).execute()
            ExtractData(filename).execute()
            self.remove_file(filename)
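`remove_file` is called above but not defined in the snippet; a minimal sketch, assuming it deletes the processed source PDF (`INPUT_FOLDER_PATH` comes from the surrounding code, while the `.pdf` extension and the deletion itself are assumptions):

import os


def remove_file(self, filename):
    # Hypothetical implementation: delete the input PDF once it has been processed.
    path = os.path.join(self.INPUT_FOLDER_PATH, filename + '.pdf')  # extension assumed
    if os.path.exists(path):
        os.remove(path)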
import requests
# assumes: from extract_data import ExtractData, as in the other snippets


def details():
    # Credentials masked in the source.
    payload = {
        'email': '*****@*****.**',
        'password': '******'
    }
    # Authenticate to obtain an API token.
    res_1 = requests.post(
        'http://api.passivereferral.com/index.php/api/authenticate/',
        json=payload)
    token = res_1.json()['token']
    # Fetch the SMTP settings for that token.
    url = 'http://api.passivereferral.com/index.php/api/getsmtp/?token=' + token
    res = requests.get(url, json=payload)
    response = res.json()[0]
    user_name = response['user_name']
    password = response['password']
    mail_server = response['smtpname']
    ssl_enabling = response['ssl_enabled']
    token = response['token']
    if ssl_enabling == '0':
        port = None
    y = ExtractData(mail_server, None, False, user_name, password, token)
    y.convert_into_html()
import logging

from extract_data import ExtractData
# DataPreprocessing is assumed to come from a project module not shown in these snippets.


class FileHandler:
    def __init__(self, df_data):
        self.df_data = df_data
        self.network = None
        self.network_adjusted = None
        self.df_types = None
        self.data_df = None
        self.d_id_title = None
        self.d_title_id = None
        self.views_df = None
        self.extract_data = ExtractData()
        self.logger = logging.getLogger(__name__)

    def organize_data(self):
        self.logger.info("organize data")
        try:
            data_preprocessing = DataPreprocessing(self.df_data)
            network, network_adjusted, df_types = data_preprocessing.preprocessing()
            data_df = self.extract_data.create_final_df(network_adjusted)
            # data_df.to_csv("data_df.csv")
            d_id_title, d_title_id = self.extract_data.find_dicts(data_df)
            views_df = self.extract_data.create_views_df(data_df)
            return network, network_adjusted, df_types, data_df, d_id_title, d_title_id, views_df
        except Exception as err:
            self.logger.error(f"encountered error: {err}; args: {err.args}")
            raise  # re-raise so process_files() does not try to unpack None

    def process_files(self):
        self.logger.info("process_files")
        network, network_adjusted, df_types, data_df, d_id_title, d_title_id, views_df = \
            self.organize_data()
        self.network = network
        self.network_adjusted = network_adjusted
        self.df_types = df_types
        self.data_df = data_df
        self.d_id_title = d_id_title
        self.d_title_id = d_title_id
        self.views_df = views_df
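A minimal usage sketch for `FileHandler`, assuming `df_data` is a pandas DataFrame (its column layout is defined by `DataPreprocessing`, which is not shown here; the input file name is hypothetical):

import pandas as pd

df_data = pd.read_csv("raw_network.csv")  # hypothetical input file

handler = FileHandler(df_data)
handler.process_files()

# process_files() caches every intermediate artifact on the instance.
print(handler.data_df.head())
print(handler.d_id_title)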
# -*- coding: utf-8 -*-
import pylab

from extract_data import ExtractData

parser = ExtractData()
parser.ExtractError()


def parse():
    # Read the (x, y, angle) error triples dumped by ExtractError().
    error = open("errors.csv")
    X = []
    Y = []
    ANGLE = []
    for line in error:
        data = line.split(",")
        X.append(float(data[0]))
        Y.append(float(data[1]))
        ANGLE.append(float(data[2]))
    return X, Y, ANGLE


x_error, y_error, angle_error = parse()

# One sample every 0.2 s.
t = [i * 0.2 for i in range(len(x_error))]
pylab.plot(t, x_error, 'r')
pylab.plot(t, y_error, 'g')
ax1 = pylab.gca()
ax1.set_xlabel("Time [s]")
ax1.set_ylabel("Distance error [cm]")
pylab.legend((r'$x$', r'$y$'), shadow=True, loc=(0.84, 0.84))
pylab.show()
import glob
import sys

from extract_data import ExtractData
from preprocess_data import PreprocessData
from utils import read_json

if __name__ == '__main__':
    path_documents = ""  # path to the folder of JSON documents
    list_documents = glob.glob(path_documents + '*.json')
    # Path of each document
    for path_doc in list_documents:
        # Load the JSON document
        data = read_json(path_doc)
        # Extraction object
        obj_data = ExtractData(data)
        paper_id = obj_data.get_paper_id()
        title = obj_data.get_title()
        text = obj_data.get_text()
        # Object to pre-process the text (steps sketched after this snippet):
        obj_preprocess = PreprocessData(text)
        # Convert the text to lower case
        # Remove punctuation
        # Remove numbers
        # Remove stop words
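The four pre-processing steps above are comments only, and these snippets never show the corresponding `PreprocessData` method names (only `remove_number` and a `lower_case` test appear in the test snippet below). A minimal standalone sketch of the steps, assuming Python's `string.punctuation` and NLTK's English stop-word list:

import string

from nltk.corpus import stopwords  # assumes NLTK with the 'stopwords' corpus downloaded


def clean_text(text):
    # Convert the text to lower case.
    text = text.lower()
    # Remove punctuation.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers.
    text = ''.join(ch for ch in text if not ch.isdigit())
    # Remove stop words.
    words = [w for w in text.split() if w not in stopwords.words('english')]
    return ' '.join(words)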
import glob
import sys

from langdetect import detect  # assumed source of detect(); not imported in the original
from extract_data import ExtractData
from preprocess_data import PreprocessData
from utils import read_json
from utils import write_file

language = 'en'

if __name__ == '__main__':
    path_documents = sys.argv[1]
    list_documents = glob.glob(path_documents + '*.json')
    print("number of documents processed:", len(list_documents))
    # Path of each document
    for path_doc in list_documents:
        # Load the JSON document
        data = read_json(path_doc)
        # Extraction object
        obj_data = ExtractData(data)
        # Get the full text
        text = obj_data.get_text()
        # Make sure the article is in English
        if detect(text) == language:
            paper_id = obj_data.get_paper_id()
            title = obj_data.get_title()
            # Object to pre-process the text
            obj_preprocess = PreprocessData(text)
            # Convert the text to lower case
import unittest

from utils import *
from preprocess_data import PreprocessData
from extract_data import ExtractData

dic = {'paper_id': 'ABC00001',
       'metadata': {'title': 'Covid', 'authors': []},
       'body_text': [{'text': 'Covid is coron virus', 'cite_spans': [],
                      'section': '', 'ref_spans': []}],
       'ref_entries': {},
       'back_matter': [],
       'bib_entries': {'BIBREF0': {'title': 'The possible macroeconomic effect on the UK of an influenza pandemic',
                                   'authors': [], 'year': 2009, 'venue': '', 'volume': '',
                                   'issn': '', 'pages': None, 'other_ids': {'DOI': []}}}}
path_file = "/Users/youssefbencheikh/Desktop/ABC00001.json"


class Test_utils(unittest.TestCase):
    def test_isnumber(self):
        self.assertEqual(is_number(12), True, "Should be True")
        self.assertEqual(is_number("Covid"), False, "Should be False")

    def test_read_json(self):
        self.assertEqual(read_json(path_file), dic, "Should be a dictionary")


data = ExtractData(dic)


class Test_extract_data(unittest.TestCase):
    def test_get_paper_id(self):
        self.assertEqual(data.get_paper_id(), 'ABC00001', "Should be 'ABC00001'")

    def test_get_title(self):
        self.assertEqual(data.get_title(), 'covid', "Should be 'covid'")

    def test_get_text(self):
        self.assertEqual(data.get_text(), 'Covid is coron virus',
                         "Should be 'Covid is coron virus'")


preprocess = PreprocessData("test")


class Test_preprocess_data(unittest.TestCase):
    def test_remove_number(self):
        self.assertEqual(preprocess.remove_number("hello number 1 and 2"),
                         'hello number and', "Should be 'hello number and'")

    def test_lower_case(self):
        # Assumed check: a lower_case() method is not shown in these snippets;
        # this mirrors test_remove_number.
        self.assertEqual(preprocess.lower_case("Hello Covid"), 'hello covid',
                         "Should be 'hello covid'")
def requires(self):
    return ExtractData()  # task(s) the Filter depends on
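The `requires()` signature above matches a Luigi task; a minimal sketch of how the surrounding Filter task could look, assuming Luigi (the output target and run body are assumptions):

import luigi


class Filter(luigi.Task):
    def requires(self):
        return ExtractData()  # task(s) the Filter depends on

    def output(self):
        return luigi.LocalTarget("filtered.csv")  # hypothetical output target

    def run(self):
        # Hypothetical body: copy the ExtractData output through a filter step.
        with self.input().open('r') as fin, self.output().open('w') as fout:
            for line in fin:
                fout.write(line)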
def test_extract_data(self):
    from extract_data import ExtractData
    e = ExtractData(yaml_file="../ICML2011.yaml")
    e.extract_data()
# -*- coding: utf-8 -*-
import pylab

from extract_data import ExtractData

parser = ExtractData()
parser.ExtractPlanSpeeds()
parser.ExtractOdomSpeeds()


def parse(filename):
    # Read (linear, angular) speed pairs from a CSV dump.
    f = open(filename)
    V = []
    W = []
    for line in f:
        data = line.split(",")
        V.append(float(data[0]))
        W.append(float(data[1]))
    return V, W


v, w = parse("plan_speeds.csv")    # planned speeds
vr, wr = parse("odom_speeds.csv")  # speeds reported by odometry

# One sample every 0.2 s.
t = [i * 0.2 for i in range(len(v))]

# Linear speeds: planned vs. odometry.
pylab.plot(t, v)
pylab.plot(t, vr)
pylab.grid(True)

# Angular speeds on a second figure.
fig = pylab.figure()
pylab.plot(t, w)
pylab.plot(t, wr)
pylab.grid(True)
pylab.show()