def read(self, fn):
    """Read a JSON OIE file and populate ``self.oie``.

    The file maps each sentence to a list of tuples; every tuple becomes
    an Extraction keyed by its sentence in ``self.oie``.
    """
    d = defaultdict(list)  # idiomatic: `list`, not `lambda: []`
    with open(fn) as fin:
        data = json.load(fin)
    for sentence, tuples in data.items():
        for t in tuples:
            # A bare "<be>" predicate marks a copula; otherwise strip the
            # "<be> " prefix from the predicate string.
            if t["pred"].strip() == "<be>":
                rel = "[is]"
            else:
                rel = t["pred"].replace("<be> ", "")
            curExtraction = Extraction(pred=rel,
                                       head_pred_index=None,
                                       sent=sentence,
                                       confidence=1.0,  # fixed confidence for gold tuples
                                       index=None)
            # Add every non-empty argument slot, in canonical order
            # (replaces six copy-pasted `if` statements).
            for key in ("arg0", "arg1", "arg2", "arg3", "temp", "loc"):
                if t[key] != "":
                    curExtraction.addArg(t[key])
            d[sentence].append(curExtraction)
    self.oie = d
def __init__(self, cam_type='zed', realtime=False):
    """Set up the odometry source and extractor for the given camera.

    :param cam_type: 'zed' or 'pg'; selects the GPS fix file and offsets.
    :param realtime: forwarded to State_Transition.
    """
    # BUG FIX: the original compared strings with `is`, which tests object
    # identity and only works by CPython string-interning accident.
    if cam_type == 'zed':
        odom_file = '/export/patraval/robo_car_new_loop_all/zed_front/gps/fix.txt'
        self.initial_offset = 6070
        self.offset_length = 6634
    elif cam_type == 'pg':
        # odom_file = '/export/patraval/robo_car_loop2/pg_cam/gps/fix.txt'
        odom_file = '/export/patraval/robo_loop_pg_only/pg_cam/gps/fix.txt'
        self.initial_offset = 5953
        self.offset_length = 6522
    # NOTE(review): the values below unconditionally override the
    # per-camera offsets chosen above — kept as-is to preserve behavior,
    # but confirm this is intended.
    self.initial_offset = 1600  # pg #1417 #-- zed
    # self.initial_offset = 1948
    self.offset_length = 6000
    # self.initial_offset = 3510
    # self.offset_length = 6634
    print(self.initial_offset, self.offset_length)
    # NOTE(review): `odom_file` is unbound if cam_type is neither 'zed'
    # nor 'pg' — a NameError would follow; confirm callers never do that.
    self.transformer = State_Transition(odom_file, cam_type,
                                        self.initial_offset, realtime)
    self.extractor = Extraction(cam_type)
def read(self, fn):
    """
    Read a tabbed format line
    Each line consists of:
    sent, prob, pred, arg1, arg2, ...

    Populates ``self.oie``: sentence text -> list of Extraction objects.
    """
    d = {}
    ex_index = 0
    with open(fn) as fin:
        for line in fin:
            if not line.strip():
                continue
            data = line.strip().split('\t')
            try:
                text, confidence, rel = data[:3]
            except ValueError:
                # Malformed line with fewer than 3 fields — skip it.
                # (BUG FIX: was a bare `except:`, which also swallowed
                # KeyboardInterrupt / SystemExit.)
                continue
            curExtraction = Extraction(
                pred=rel,
                head_pred_index=None,
                sent=text,
                confidence=float(confidence),
                question_dist="./question_distributions/dist_wh_sbj_obj1.json",
                index=ex_index)
            ex_index += 1
            for arg in data[3:]:
                curExtraction.addArg(arg)
            # Append to this sentence's extraction list.
            d.setdefault(text, []).append(curExtraction)
    self.oie = d
def __init__(self):
    """Initialize the ICP/EKF localization node: state, map, and ROS I/O."""
    self.icp = ICP()
    self.ekf = EKF()
    self.extraction = Extraction()
    # odom robot init states
    self.robot_x = rospy.get_param('/icp/robot_x', 0)
    self.robot_y = rospy.get_param('/icp/robot_y', 0)
    self.robot_theta = rospy.get_param('/icp/robot_theta', 0)
    self.sensor_sta = [self.robot_x, self.robot_y, self.robot_theta]
    self.isFirstScan = True
    self.src_pc = []
    self.tar_pc = []
    # State Vector [x y yaw]
    self.xOdom = np.zeros((3, 1))
    self.xEst = np.zeros((3, 1))
    self.PEst = np.eye(3)
    # map observation
    self.obstacle = []
    # radius
    self.obstacle_r = 10
    # init map
    self.updateMap()
    # ros topic
    # BUG FIX: create all publishers BEFORE subscribing. rospy delivers
    # callbacks on a separate thread, so a scan arriving before the
    # publishers existed would raise AttributeError inside laserCallback.
    self.location_pub = rospy.Publisher('ekf_location', Odometry, queue_size=3)
    self.odom_pub = rospy.Publisher('icp_odom', Odometry, queue_size=3)
    self.odom_broadcaster = tf.TransformBroadcaster()
    self.landMark_pub = rospy.Publisher('/landMarks', MarkerArray, queue_size=1)
    self.laser_sub = rospy.Subscriber('/course_agv/laser/scan', LaserScan, self.laserCallback)
def requires(self):
    """
    Depends on list of Extraction tasks — one per configured URL.
    """
    # Lazily produce one Extraction dependency per URL.
    yield from map(Extraction, self.urls)
def __init__(self, nodeName="slam_ekf"):
    """Initialize the SLAM/EKF localization node state and ROS topics.

    :param nodeName: name passed to the parent node class.
    """
    super(SLAM_Localization, self).__init__(nodeName)
    self.icp = SubICP()
    self.extraction = Extraction()
    self.isFirstScan = True
    self.laser_count = 0
    # interval
    self.laser_interval = 5
    # State Vector [x y yaw].T, column vector.
    # self.xOdom = np.zeros((STATE_SIZE,1))
    self.xEst = np.zeros((STATE_SIZE, 1))
    # Covariance. Initial is very certain.
    self.PEst = np.zeros((STATE_SIZE, STATE_SIZE))
    # landMark Estimation. Like former self.tar_pc
    self.lEst = np.zeros((LM_SIZE, 0))  # lEst should be of 2*N size

    ## localization parameters
    # minimum landmark matches to update.
    self.min_match = int(rospy.get_param('/slam/min_match', 2))
    # minimum number of points for a landmark cluster
    self.extraction.landMark_min_pt = int(
        rospy.get_param('/slam/landMark_min_pt', 1))
    # maximum radius to be identified as landmark
    self.extraction.radius_max_th = float(
        rospy.get_param('/slam/radius_max_th', 0.4))

    # ros topic
    # BUG FIX: subscribe LAST. rospy delivers callbacks on a separate
    # thread, so a scan arriving before the parameters above were set
    # would read unconfigured state.
    self.laser_sub = rospy.Subscriber('/course_agv/laser/scan',
                                      LaserScan, self.laserCallback)
    # self.location_pub = rospy.Publisher('ekf_location',Odometry,queue_size=3)


# NOTE(review): appears to be a module-level constant following the class
# in the original file — confirm placement.
OBSTACLE_RADIUS = 0.35
def setUpClass(cls):
    """Build the task fixtures and seed HDFS with test pages.

    NOTE(review): presumably decorated with @classmethod outside this view.
    """
    cls.extraction_task_1 = Extraction(url=TEST_URL_1)
    cls.extraction_task_2 = Extraction(url=TEST_URL_2)
    cls.saving_task = Saving([TEST_URL_1, TEST_URL_2])
    # Stub the Saving task's input() so it reads the two extraction
    # outputs directly, without running the upstream tasks.
    cls.saving_task.input = lambda: [
        cls.extraction_task_1.output(), cls.extraction_task_2.output()
    ]
    # Wipe the application's HDFS area, recreate the extraction output
    # directory, then upload the two fixture HTML pages into it.
    cls.cmd("hadoop fs -rm -r /{app_name}".format(
        app_name=APPLICATION_NAME).split())
    cls.cmd("hadoop fs -mkdir -p /{app_name}/{ex_out}".format(
        app_name=APPLICATION_NAME, ex_out=EXTRACTION_OUTPUT).split())
    cls.cmd(
        "hadoop fs -put test-resources/https--en.wikipedia.org-wiki-Battle_of_Austerlitz.html /{app_name}/{ex_out}"
        .format(app_name=APPLICATION_NAME, ex_out=EXTRACTION_OUTPUT).split())
    cls.cmd(
        "hadoop fs -put test-resources/https--en.wikipedia.org-wiki-Napoleon.html /{app_name}/{ex_out}"
        .format(app_name=APPLICATION_NAME, ex_out=EXTRACTION_OUTPUT).split())
def GET(self, datafile, method):
    """Returns some extracted information from a file.

    :param datafile: basename of a previously uploaded file in TMP_FOLDER.
    :param method: 'text' | 'file' | 'header' | 'citations' | 'body'
                   | 'keyphrases'; selects what to return.
    Output format is controlled by the ``output`` query param (xml/json).
    """
    params = web.input(output="xml")
    extractor = Extraction()
    data = ''
    txtfile = TMP_FOLDER + datafile + '.txt'
    # Check if the file exists, if not return a 404
    if not os.path.exists(txtfile):
        return web.notfound()
    try:
        if method == 'text':
            txtfile = TMP_FOLDER + datafile + '.txt'
            web.header('Content-Type', 'text/text')  # Set the Header
            # BUG FIX: close the handle (was a leaked open().read()).
            with open(txtfile, "rb") as f:
                return f.read()
        elif method == 'file':
            pdffile = TMP_FOLDER + datafile
            typeFilterStatus = utilities.typeFilter(pdffile)
            web.header('Content-Type', typeFilterStatus)  # Set the Header
            with open(pdffile, "rb") as f:
                return f.read()
        else:
            if method == 'header':
                data = data + extractor.extractHeaders(txtfile)
            elif method == 'citations':
                data = data + extractor.extractCitations(txtfile)
            elif method == 'body':
                data = data + extractor.extractBody(txtfile)
            elif method == 'keyphrases':
                data = data + extractor.extractKeyphrases(txtfile)
            # Print XML or JSON
            if params.output in ('xml', ''):
                web.header('Content-Type', 'text/xml; charset=utf-8')
                return utilities.printXML(data)
            elif params.output == 'json':
                jsondata = xmltodict.parse(data)
                web.header('Content-Type', 'text/json; charset=utf-8')
                return json.dumps(jsondata)
            else:
                web.ctx.status = '400'
                return 'Unsupported output format. Options are: "xml" (default) and "json"'
    except (IOError, OSError) as er:
        # Internal error, i.e. during extraction
        web.debug(er)
        return web.internalerror()
def read(self, fn):
    """Read a tab-separated OIE file: sent \\t pred \\t arg1 \\t arg2 ...

    Populates ``self.oie``: sentence text -> list of Extraction objects,
    using the line number as the extraction index.
    """
    d = defaultdict(list)  # idiomatic: `list`, not `lambda: []`
    with open(fn) as fin:
        for line_ind, line in enumerate(fin):
            data = line.strip().split('\t')
            text, rel = data[:2]
            args = data[2:]
            curExtraction = Extraction(pred=rel,
                                       head_pred_index=None,
                                       sent=text,
                                       confidence=1.0,  # no confidence column in this format
                                       index=line_ind)
            for arg in args:
                curExtraction.addArg(arg)
            d[text].append(curExtraction)
    self.oie = d
def read(self, fn):
    """Parse a tab-separated file into Extraction objects keyed by sentence.

    Expected columns: text, confidence, predicate, arg1, [more args...].
    Lines with fewer than four columns are ignored.
    """
    extractions_by_sent = {}
    with open(fn) as fin:
        for raw_line in fin:
            fields = raw_line.strip().split('\t')
            # Guard clause: skip malformed rows.
            if len(fields) < 4:
                continue
            text, confidence, rel, first_arg = fields[0], fields[1], fields[2], fields[3]
            extraction = Extraction(pred=rel,
                                    head_pred_index=-1,
                                    sent=text,
                                    confidence=float(confidence))
            extraction.addArg(first_arg)
            for extra_arg in fields[4:]:
                extraction.addArg(extra_arg)
            extractions_by_sent.setdefault(text, []).append(extraction)
    self.oie = extractions_by_sent
def __init__(self):
    """Initialize the SLAM node: params, mapping grid, state, and ROS I/O."""
    # ros param
    self.robot_x = rospy.get_param('/slam/robot_x', 0)
    self.robot_y = rospy.get_param('/slam/robot_y', 0)
    self.robot_theta = rospy.get_param('/slam/robot_theta', 0)
    ## ros param of mapping
    self.map_x_width = rospy.get_param('/slam/map_width')
    self.map_y_width = rospy.get_param('/slam/map_height')
    self.map_reso = rospy.get_param('/slam/map_resolution')
    # Grid dimensions in cells (rounded to nearest integer).
    self.map_cellx_width = int(round(self.map_x_width / self.map_reso))
    self.map_celly_width = int(round(self.map_y_width / self.map_reso))
    self.icp = ICP()
    self.ekf = EKF()
    self.extraction = Extraction()
    self.mapping = Mapping(self.map_cellx_width, self.map_celly_width, self.map_reso)
    # odom robot init states
    self.sensor_sta = [self.robot_x, self.robot_y, self.robot_theta]
    self.isFirstScan = True
    self.src_pc = []
    self.tar_pc = []
    # State Vector [x y yaw]
    self.xOdom = np.zeros((STATE_SIZE, 1))
    self.xEst = np.zeros((STATE_SIZE, 1))
    self.PEst = np.eye(STATE_SIZE)
    # map observation
    self.obstacle = []
    # radius
    self.obstacle_r = 10
    # ros topic
    # BUG FIX: create all publishers BEFORE subscribing. rospy delivers
    # callbacks on a separate thread, so a scan arriving before the
    # publishers existed would raise AttributeError inside laserCallback.
    self.location_pub = rospy.Publisher('ekf_location', Odometry, queue_size=3)
    self.odom_pub = rospy.Publisher('icp_odom', Odometry, queue_size=3)
    self.odom_broadcaster = tf.TransformBroadcaster()
    self.landMark_pub = rospy.Publisher('/landMarks', MarkerArray, queue_size=1)
    self.map_pub = rospy.Publisher('/slam_map', OccupancyGrid, queue_size=1)
    self.laser_sub = rospy.Subscriber('/course_agv/laser/scan', LaserScan, self.laserCallback)
def __init__(self, nodeName="ekf_icp"):
    """Initialize the EKF landmark localization node.

    :param nodeName: name passed to the parent node class.
    """
    super(EKF_Landmark_Localization, self).__init__(nodeName)
    self.icp = SubICP()
    self.extraction = Extraction()
    self.src_pc = None
    self.isFirstScan = True
    self.laser_count = 0
    # interval
    self.laser_interval = 5
    # State Vector [x y yaw].T, column vector.
    self.xEst = np.zeros((STATE_SIZE, 1))
    # Covariance. Initial state is certain.
    # self.PEst=np.eye(STATE_SIZE)
    self.PEst = np.zeros((STATE_SIZE, STATE_SIZE))
    # init map
    # map observation
    self.tar_pc = None
    self.updateMap()

    # parameters from launch file.
    # minimum landmark matches to update. Actually even 1 point is acceptable.
    self.min_match = int(rospy.get_param('/localization/min_match', 1))
    # minimum number of points for a landmark cluster
    self.extraction.landMark_min_pt = int(
        rospy.get_param('/localization/landMark_min_pt', 2))
    # maximum radius to be identified as landmark
    self.extraction.radius_max_th = float(
        rospy.get_param('/localization/radius_max_th', 0.4))

    # ros topic
    # BUG FIX: subscribe LAST. rospy delivers callbacks on a separate
    # thread, so a scan arriving before the parameters above were set
    # would read unconfigured state.
    self.laser_sub = rospy.Subscriber('/course_agv/laser/scan',
                                      LaserScan, self.laserCallback)
    # self.location_pub = rospy.Publisher('ekf_location',Odometry,queue_size=3)
from extraction import Extraction import time startTime = time.time() print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(startTime)) extract1 = Extraction() extract1.loadData() extract1.run() endTime = time.time() print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(endTime)) seconds = endTime - startTime print seconds m, s = divmod(seconds, 60) h, m = divmod(m, 60) print "%d:%02d:%02d" % (h, m, s)
from flask import Flask, render_template, request, jsonify import sentences_list import random from extraction import Extraction from parssing import Parssing import wikipedia app = Flask(__name__) app.config['SECRET_KEY'] = '7TKUKe09wW1PlrtSL066lsN18uWA7iuO' # Instances creation datas_extraction = Extraction() datas_management = Parssing() @app.route('/') def home(): return render_template('index.html', title="Bienvenue chez GrandPy Bot") @app.route('/_answer') def answer(): question = request.args.get('question', 0, type=str) return jsonify(result=question) @app.route('/_address') def address(): question = request.args.get('question', 0, type=str) filtered_sentence = Parssing.get_main_words(datas_management, question)
set_logger() if EXTRACTION: """ The extraction phase involves the process of obtaining a set of documents from a repository, such as Scopus or the Web of Science, or it can involve the steps of scraping a publisher’s website to retrieve full-text articles (typically in PDF format). Scopus generally provides publication abstracts, including all the meta-data (journal, authors, affiliations, publication date) through various APIs. The upside of using an API is that publication content is easily obtained for a large number of documents simultaneously, however, these APIs often do not provide full-text for all the publications or journals of interest. In these cases, scraping a publisher’s websites can be an alternative solution. This process involves building many handcrafted crawlers, as each publisher lists their publications in a different manner on their website. Download limits should always be respected when building such scripts. Another option would be to manually download articles, although such approaches might not be feasible if the document collection of interest contains thousands or tens of thousands of articles. To enable a comparison of topics by time, or a comparison of topics by journals, it is important to store this information alongside the content of the document. """ # instantiate Extraction class extraction = Extraction() # extract publications from NIPS website extraction.extract_publications() if PREPROCESSING: """ The pre-processing phase can be seen as the process of going from a document source to an interpretable representation for the topic model algorithm. This phase is typically different for full-text and abstract data. One of the main differences is that abstract data is often provided in a clean format, whereas full-text is commonly obtained by converting a PDF document into its plain text representation. 
Within this phase, an important part is to filter out the content that is not important from a topic model's point-of-view, rather than from a human’s point-of-view. Abstract data usually comes in a clean format of around 300--400 words, and little additional text is added to it; typically the copyright statement is the only text that should be removed. In contrast, full-text articles can contain a lot of additional text that has been added by the publisher. This is article meta-data and boilerplate. It is important that such additional text is removed, and various methods to do so exist. Examples include: deleting the first cover page; deleting the first n-bits of the content; using regular expressions or other pattern matching techniques to find and remove additional text, or more advanced methods. For full-text articles, a choice can be made to also exclude the reference
if __name__ == "__main__":
    # Interactive entry point: print usage notes, then loop on a text menu
    # until the user enters 'q'. (Python 2 script: print statement, raw_input.)
    print "This is a script to help extract audio features and to calculate distances between songs with choice parameters."
    print "All data is stored in a database of MongoDB."
    print "The script only works with wave files, being named in a way described later. You can give mp3 or m4a files (ID3 taged!) "
    print "to the script and it will convert them towave files with ffmpeg for you. Make sure ffmpeg and mutagen are installed."
    print "For audio feature extraction, this script is using the bregman audio toolbox and aubio toolbox (for the rhythm) "
    print "which you have to download and install (make sure both work)."
    print "There is also a set of prerequisits for python: you should have installed numpy, etc."  #update all python packages that have to be installed
    print "__________________________________________"
    print "At any time, press q if you want to quit."
    print "__________________________________________"
    y = None
    # Menu loop: each choice constructs the worker object, whose __init__
    # presumably performs the work — TODO confirm against those classes.
    while (y != "q"):
        message = "What do you want to do? File conversion (c), feature extraction (e) or distance calculation(d)? : "
        y = raw_input(message)
        if y == "c":
            c = Conversion()
        elif y == "e":
            e = Extraction()
        elif y == "d":
            d = Distance()
        elif y == "q":
            print "Goodbye!"
        else:
            print "There is no %r option, please try again." %(y)
#!/usr/bin/env python # -*- coding: utf-8 -*- from conversion import Conversion from extraction import Extraction from distance import Distance from sys import argv if __name__ == "__main__": y = argv[1] if "-c" in y: c = Conversion(y) elif "-e" in y: e = Extraction(y) elif y == "-d": d = Distance() else: print "There is no %r option. Try -cmp3 -cm4a -e or -d." % (y)
song_features = db.song_features_collection # #####REPLACE##### distance_features = db.distance_features_collection # #####REPLACE##### #__________________________________________________________________________ options, rem = getopt(argv[1:], 'c:e:d:n:g:h', [ 'conversion=', 'extraction=', 'distance', 'neighbour', 'graphdist', 'help' ]) for opt, arg in options: if opt in ('-c', '--conversion'): from conversion import Conversion c = Conversion(arg, rem[0]) elif opt in ('-e', '--extraction'): from extraction import Extraction e = Extraction(arg, song_features) elif opt in ('-d', '--distance'): from distance import Distance d = Distance(song_features, distance_features) elif opt in ('-n', '--neighbour'): from neighbour import Neighbour n = Neighbour(song_features, distance_features) elif opt in ('-g', '--graphdist'): from graphdist import Graphdist g = Graphdist(song_features, distance_features) elif opt in ('-h', '--help'): print """The following options are available: -c, --conversion mp3/m4a => conversion of mp4 and mp3 files to wave files -e, --extraction