Example #1
	def install(self):
		# Check if package installed
		db = hpakDB(self.pkg_name)
		if db.get_value("status") == "installed":
			misc.print_error("%s - already installed!" % (self.pkg_name), False)
			return
							
		self.prepare_install()
		dl = download(self.options['source'], self.pkg_path, self.pkg_name)
		dl.get()
		
		# Extracting the file.
		e = Extractor(self.options)
		e.extract()

		# Install depends
		self.install_dep()

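		# Run the package's install commands (comma-separated in the options) one
		# after another through the shell.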
		Cmds = self.options['install'].split(',')
		for cmd in Cmds:
			subprocess.Popen(cmd, shell=True).wait()

		# Verify package installed.
		if os.path.exists("%s/%s" % (HPAK_ROOT, self.options['dir'])):
			db = hpakDB(self.pkg_name)
			db.set_value("status", "installed")
			misc.print_success("%s installed." % (self.pkg_name))
		else:
			misc.print_error("%s-%s NOT installed, please try again." % (self.pkg_name, self.options['version']), True) 
Example #2
 def process(self,file):
     feats = {}
     Extractor.process(self,file)
     ir = InputReader(file)
     ir.read()
     cqpf = CQPFormat(ir.getText())
     pos = cqpf.getColumn(1)
     # initialize counts
     
     for name in self.posnames:
         feats[name] = 0
     
     for i in range(2,len(pos)): # ignore first two pos ...
         uni =  (pos[i])[0:3]
         bi = (pos[i-1])[0:3] + "_" + uni
         tri = (pos[i-2])[0:3] + "_" + bi
         if uni in feats:
             feats[uni] += 1
         if bi in feats:
             feats[bi] += 1
         if tri in feats:
             feats[tri] += 1
         
     for x in self.posnames:
         feats[x] /= float(len(pos)-2)
     
     return ir.getID(),feats
Example #3
class ExtractorTest(unittest.TestCase):

    def setUp(self):
        example = "LOQ75625Team LOQ            49% blend std 8mm       21-JUN-2000 12:55:23    30.0"
        self.extractor = Extractor()
        self.extractor.extract_data(example)

    def tearDown(self):
        pass

    def test_extract_valid_instrument_name(self):
        self.assertEqual("LOQ", self.extractor.instrumentname)

    def test_extract_valid_run_number(self):
        self.assertEqual("75625", self.extractor.runnumber)

    def test_extract_valid_run_username(self):
        self.assertEqual("Team LOQ", self.extractor.username)

    def test_extract_valid_run_experimenttitle(self):
        self.assertEqual("49% blend std 8mm", self.extractor.experimenttitle)

    def test_extract_valid_run_startdate(self):
        self.assertEqual("21-JUN-2000", self.extractor.startdate)

    def test_extract_valid_run_starttime(self):
        self.assertEqual("12:55:23", self.extractor.starttime)

    def test_extract_valid_run_charge(self):
        self.assertEqual("30.0", self.extractor.charge)
Example #4
class Framework:
    """"""

    def __init__(self, classifier):
        """"""
        self.classifier = classifier
        self.extractor = Extractor()

    def _create_subjects(self):
        """"""
        return [Subject(FILE_NAMES['NormROIS'] % (file_index + 1)) 
                    for file_index in range(NUM_OF_SUBJECTS)]

    def _train(self, classifier, features):
        """"""
        classifier.train(features)

    def _classify(self, classifier):
        """"""
        classifier.classify()

    def execute(self):
        """"""
        # 1) Load the data files
        subjects = self._create_subjects()
        # 2) Extract the features
        self.extractor.extract_features(subjects)
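        # NOTE: the debug print below also calls exit(), so steps 3 and 4 below
        # never run; remove it to actually train and classify.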
        print len(self.extractor.features['P']), exit()
        # 3) Train the classifier
        self._train(self.classifier, self.extractor.features)
        # 4) Classify some data
        self._classify(self.classifier)
Example #5
 def test_cond(self):
     from masks import mask
     e = Extractor()
     logging.debug(e)
     e.add_feature_condition(mask)
     res = e.extract(self.data)
     self.assertTrue(len(res[self.data.keys()[0]]) > 0)
Example #6
	def generateFeatures(self):
		'''
		Hardcoded for Wikipedia.
		For each category, fetch the wiki pages listed in list.txt and
		store the keywords (links in the specified section) in features.txt.
		'''
		e = Extractor()
		print self.categories
		for name in self.categories:
			print name
			f = open("%s/%s/%s" % (self.config.get(self.section, "CLASSES_FILE"), name, self.config.get(self.section, "LIST_FILE")), "r")
			g = open("%s/%s/%s" % (self.config.get(self.section, "CLASSES_FILE"), name, self.config.get(self.section, "FEATURE_FILE")), "w")
			for page in f:
				print page
				pagetok = page.strip().split('\t')
				try: section = pagetok[1]
				except: section = 0
				links = e.getWikiLinks(pagetok[0], section = section)
				for feature in links:
					units = set(self.clean(feature).split('_'))
					for unit in units:
						unit = self.stemmer.stem(unit)
						if self.valid(unit):
							g.write("%s," % unit)
				g.write("\n")
			f.close()
			g.close()
Example #7
 def __init__(self):
     self.__featureNames = sorted(
         [name for (name, re) in self.DIRECT_FEATS]
         + [name for (name, re) in self.LEMMA_FEATS]
         + self.CALCULATED_FEATS
     )
     Extractor.__init__(self)
Example #8
    def test_extractorResultGetData(self):
        strategy = mock.MagicMock()
        strategy.get_data.return_value = {"success": True}
        extractor = Extractor(strategy)
        result = extractor.get_result()

        self.assertTrue(result.get_data()["success"])
Example #9
def ext_json():
    rdfUrl = ''
    tok = Tokenizer()
    if request.method == 'POST':
        rdf = request.form['data']
        status_test = "0"#request.form['status']
        filters = ""#request.form['exculdeurls']
        #rdf = "http://jpp.no-ip.org/MAD_J.rdf"
        try:
            #r = requests.get(rdf)
            gg = Graph()
            #g.load(rdfUrl)
            rdf_content = StringIO.StringIO(rdf.encode('utf-8'))
            #print rdf_content.readline()
            gg.parse(rdf_content,  format="xml")
            ext = Extractor(gg)
            uris = ext.getUris()
            mapping = MapFactory()
            for uri in uris:
                term = tok.tokenized_url(uri)
                uri_status = ""
                if status_test == "1":
                    uri_status = ext.testUri(uri)
                else:
                    uri_status = "N/A"  
                uri_lookup = str(uri)+"\"" 
                lnum = ext.get_lines(rdf_content, uri_lookup)          
                ent = MapEntry(uri, term, "", lnum, uri_status)
                mapping.add(ent)
            jsonized_result = json.dumps(mapping.get())              
            return Response(jsonized_result, mimetype='application/json')
        except requests.exceptions.ConnectionError:
            X2Rwarning = 'X2R Warning: The requested URL raises ConnectionError~!!!'
            return X2Rwarning
Example #10
    def test_extractorResultGetJson(self):
        strategy = mock.MagicMock()
        strategy.get_data.return_value = {"success": True}
        extractor = Extractor(strategy)
        result = extractor.get_result()

        self.assertEqual('{"success": true}', result.get_json())
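The two tests above (Examples #8 and #10) pin down a small contract: the Extractor delegates to an injected strategy object and wraps the strategy's data in a result exposing get_data() and get_json(). The classes below are only a minimal sketch consistent with those assertions, not the project's actual implementation.

import json

class ExtractorResult:
    def __init__(self, data):
        self._data = data

    def get_data(self):
        return self._data

    def get_json(self):
        # json.dumps({"success": True}) -> '{"success": true}', matching the test above
        return json.dumps(self._data)

class Extractor:
    def __init__(self, strategy):
        self._strategy = strategy

    def get_result(self):
        # Ask the injected strategy for its data and wrap it in a result object.
        return ExtractorResult(self._strategy.get_data())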
Example #11
def predict(article_link, image_link):
    """
    output: predicted emotion as: [ 0.  1.  0.  0.  0.]
    """
    e = Extractor()
    user_input = {
        "article_link": article_link,
        "image_link": image_link
    }

    friendly_json = e.user_extract(user_input)

    tax_list = friendly_json['alchemy']['taxonomy']
    tax_primary = []
    for t in tax_list:
        tax_primary.append(t['label'].split('/')[1])

    tax_primary = list(set(tax_primary))[0]

    extracted_articles = dict()
    extracted_articles['articles'] = [friendly_json]
    textEmotions = text_emotions_x(extracted_articles)
    picEmotions = picture_emotions_x(extracted_articles)

    with open('emotionClassification/trained_models/bbac_1150_all_clf.pkl','r') as f:
        clf = cPickle.load(f)

    test_article = makeDataMatrix(textEmotions, picEmotions)

    reaction = predictReactions(clf, test_article)

    return reaction[0], tax_primary
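A hypothetical call to predict() above; the URLs are placeholders, and the pickled classifier plus the helper functions it uses (text_emotions_x, picture_emotions_x, makeDataMatrix, predictReactions) must be available for it to run.

reaction, primary_taxonomy = predict(
    "http://example.com/some-article",    # placeholder article URL
    "http://example.com/some-image.jpg",  # placeholder image URL
)
print(reaction, primary_taxonomy)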
Example #12
 def process(self,file):
     feats = {}
     Extractor.process(self,file)
     ir = InputReader(file)
     ir.read()
     cqpf = CQPFormat(ir.getText())
     #words = ' '.join(cqpf.getColumn(0))
     #pos = ' '.join(self.disambiguatePOS(cqpf.getColumn(1)))
     lemma = cqpf.getColumn(2)
     sentences = cqpf.getAnnotations("s")
     wordpostmp = []
     for (start,end,attr) in sentences:
         wordpostmp.append('<s>')
         wordpostmp.extend(self.getWordsWithPOS(
                             cqpf.getColumn(0)[start:end],
                             self.disambiguatePOS(cqpf.getColumn(1)[start:end])))
         wordpostmp.append('</s> ')
     wordpos = ' '.join(wordpostmp)
     feats.update(self.extractWithREs(self.DIRECT_FEATS,wordpos))
     feats.update(self.extractWithREs(self.CALC_FEATS,wordpos))
     feats.update(self.extractFromLemmatatizedForms(self.LEMMA_FEATS,lemma))
     self.calculateFeats(feats)
     self.normalizeByLength(feats, len(lemma))
     feats.update(self.extractStatistics(cqpf))
     print feats
     return ir.getID(),feats
Example #13
	def add_synset(self, word):
		ex = Extractor()
		word_id = md5.md5(word).hexdigest()
		if not self.fdb.get(word_id):
			self.fdb.set(ROOT + word_id, word)
		synset = ex.getWikiBacklinks(word)
		if synset:
			for synonym in synset:
				self.fdb.set(SYN + synonym.upper(), word_id)
Example #14
 def test_monotony(self):
     from masks import absolute_monotony as monotony
     e = Extractor()
     logging.debug(e)
     e.add_feature_condition(monotony.Raising)
     e.add_feature_condition(monotony.Falling)
     res = e.extract(self.data)
     logging.debug("res: \n%s", pprint.pformat(res))
     self.assertTrue(len(res[self.data.keys()[0]]) > 0)
Example #15
 def process(self,file):
     feats = {}
     Extractor.process(self,file)
     ir = InputReader(file)
     ir.read()
     cqpf = CQPFormat(ir.getText())
     lengths = [end-start for (start,end,arg) in cqpf.getAnnotations("s")]
     print self.__featureNames
     feats = utils.getStats("SENT_LENGTH", lengths)
     return ir.getID(),feats
Example #16
	def add_disambiguation(self, a):
		ex = Extractor()
		ls = ex.getDisambiguationLinks(a + '_(disambiguation)')
		if ls:
			anode = self.graphdb.get_or_create_indexed_node(self.DISAMBIGUATION, 'name', a, {'name': a, 'class': self.DISAMBIGUATION})
			for l in ls:
				print "disambiguation link:", l
				lnode = self.graphdb.get_indexed_node('NODE', 'name', l)
				if lnode:
					print "creating disamb relation betn", a, ", ", l
					self.graphdb.create((anode, self.DISAMBIGUATION, lnode, {'class': self.DISAMBIGUATION, 'weight': 1}))
Example #17
def extract(args, task, crawler_list):
    """runs every crawler in crawler_list"""
    # init and start feature extractor
    logging.debug("Extracting")
    feature_extractor = Extractor()

    # FEATURE LIST:
    cli_mask_groups = parse_arg_range(args.extraction_masks, type_=str)
    all_masks = get_all_masks(cli_mask_groups + task.mask_groups)
    feature_extractor.add_feature_masks(all_masks)

    # data storage paths
    extractor_stream(task, feature_extractor, crawler_list)
    logging.debug("done extracting")
Example #18
    def __init__(self, filename):
        super(Database, self).__init__()
        Extractor.extract(filename)

        lines = ''
        with open(filename, 'r') as f:
            lines = f.readlines()

        self.courses = list()

        for i in lines:
            x = i.split(',')
            x = [y.strip('()"') for y in x]
            self.courses.append(Course(x[0],x[1],x[2],x[3],x[4],x[5],x[6]))
Example #19
class Teacher:
    def __init__(self):
        self.model = SongModel()
        self.extractor = Extractor()

    def parse_set(self):
        content = []
        with open("training/Tracks/ground_truth.csv") as f:
            for l in f:
                l = l.replace('\"', '').replace('\n', '')
                name = ""
                genre = ""
                flag = 0
                for c in l:
                    if c == ',':
                        flag = 1
                    elif flag == 0:
                        name += c
                    elif flag == 1:
                        genre += c
                content.append([name, genre])
        return content

    def train(self):
        for item in self.parse_set():
            self.extractor.set_song(item[0])
            tempo = self.extractor.get_tempo()
            rolloffmoy = self.extractor.get_rolloff_moy()
            rolloffect = self.extractor.get_rolloff_ect()
            zcrmoy = self.extractor.get_zcr_moy()
            zcrect = self.extractor.get_zcr_ect()
            duration = self.extractor.get_duration()
            self.model.add(item[0], item[1], tempo, rolloffmoy, rolloffect, zcrmoy, zcrect, duration)
            print("ADDED : " + item[0] + " " + item[1] + " " + str(tempo) + " " + str(rolloffmoy) + " " + str(rolloffect) + " " + str(zcrmoy) + " " + str(zcrect) + " " + str(duration))
        print("DONE")
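parse_set() above walks each line character by character just to split on the first comma. A shorter sketch of the same behaviour, assuming every line of ground_truth.csv contains at least one comma:

def parse_set_alt(path="training/Tracks/ground_truth.csv"):
    content = []
    with open(path) as f:
        for line in f:
            line = line.replace('"', '').replace('\n', '')
            # split on the first comma only, so genres containing commas stay intact
            name, genre = line.split(',', 1)
            content.append([name, genre])
    return content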
Example #20
    def __init__(self, parent):
        Frame.__init__(self, parent)   
         
        self.parent = parent 

        self.music_root = ''
        self.query_path = ''
        self.extractor = Extractor(n_frames=40, 
                                   n_blocks=100, 
                                   learning_rate=0.00053,
                                   verbose=True)

        self.style = Style()
        self.style.theme_use("default")
        
        padx = 2
        pady = 2

        root_select_button = Button(self, text="Select a directory")
        root_select_button.pack(fill=tkinter.X, padx=padx, pady=pady)
        root_select_button.bind("<Button-1>", self.set_music_root)

        analyze_button = Button(self, text="Analyze")
        analyze_button.pack(fill=tkinter.X, padx=padx, pady=pady)
        analyze_button.bind("<Button-1>", self.analyze)

        query_select_button = Button(self, text="Select a file")
        query_select_button.pack(fill=tkinter.X, padx=padx, pady=pady)
        query_select_button.bind("<Button-1>", self.set_query_path)

        search_button = Button(self, text="Search similar songs")
        search_button.pack(fill=tkinter.X, padx=padx, pady=pady)
        search_button.bind("<Button-1>", self.search_music)
 
        self.pack(fill=BOTH, expand=1)
Example #21
 def __init__(self):
     self.extractor = Extractor()
     self.sqs = boto3.client('sqs')
     self.queue_url = 'https://sqs.ap-southeast-1.amazonaws.com/841662669278/crawler'
     self.s3 = boto3.client('s3')
     self.dynamodb = boto3.resource('dynamodb')
     self.bloom_filter = MyBloomFilter(self.dynamodb.Table('link'))
Example #22
class IndexTrainer(object):

	def __init__(self):
		self.index = InvertedIndex()
		self.bow = Bow()
		self.extractor = Extractor('surf')
		print self.index.author
		print self.index.description

	def load_feature(self, path='../models/feature.npy'):
		self.features = np.load(path)
		if len(self.features) > 200000:
			self.features = self.features[:200000]
		print "feature shape: ", self.features.shape
		return self.features

	def run(self, path):
		self.bow.load()
		self.index.reset(self.bow.centers)
		images = imutil.get_list_image(path)
		t = imutil.Timer(1)
		t.tic()
		for i,image in enumerate(images):
			descriptors = self.extractor.extract(image)
			self.index.append(image, descriptors)
			if (i+1)%1000 == 0:
				t.toc('finish 1000 images: ')
				t.tic()
Example #23
def extract_multi(args):
  # Extract features from the net (defined by the proto file) for the images listed in the input file.
  # Features are saved in batches of max_value per .cPickle file rather than
  # as one file, because 2.4M images produce ~10 GB of data.
  pred = Extractor(args.proto_path,args.bin_path)
  max_value = 512 
  curr_value = 0
  list_all_result = list()
  list_good_class_all = list()
  list_name_file = list()
  create_dir(args.folder)
  with open(args.images,'r') as file_image:
    list_images = list()
    list_good_class = list()
    for idx,line in enumerate(file_image):
      splitted = line.split(' ')
      list_good_class.append(int(splitted[1]))
      list_images.append(splitted[0].strip())
      curr_value = curr_value + 1
      if curr_value < max_value:
        continue
      else:
        #predict using value
        predictions = pred.predict_multi(list_images)
        f = Feature(predictions,list_good_class)
        name = '/'.join((args.folder,str(idx)+"_file.cPickle"))
        list_name_file.append(os.path.abspath(name))
        save_cPickle(f,name)
        list_good_class = list()
        list_images = list()
        curr_value = 0
        print "Predicted 512"
        
    #predict last package of data, which is smaller than max_value
    if len(list_images) > 0:
      predictions = pred.predict_multi(list_images)
      list_all_result.append(predictions)
      f = Feature(predictions,list_good_class)
      name = '/'.join((args.folder,str(idx)+"_file.cPickle"))
      save_cPickle(f,name)
      list_name_file.append(os.path.abspath(name))
      
  f = open(args.folder+ '/' + 'files.txt', 'wb')
  f.writelines( "%s\n" % item for item in list_name_file)
  f.close()
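A hypothetical invocation of extract_multi(); the attribute names mirror the ones the function reads above (proto_path, bin_path, images, folder), but the paths are placeholders and the real script presumably builds args with argparse.

from argparse import Namespace

args = Namespace(
    proto_path="deploy.prototxt",       # placeholder network definition
    bin_path="weights.caffemodel",      # placeholder trained weights
    images="images_with_labels.txt",    # lines of "<image_path> <label>"
    folder="features_out",              # output directory for the .cPickle batches
)
extract_multi(args)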
Example #24
	def getWikiDist(self, a, b):
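		# Normalized-Google-Distance style measure over Wikipedia backlink sets:
		# (log max(|A|,|B|) - log |A ∩ B|) / (log N - log min(|A|,|B|)), with N = 10^7.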
		a = a.replace(' ', '_')
		b = b.replace(' ', '_')
		e = Extractor()
		sa = e.getWikiBacklinks(a, filter = "nonredirects")
		sb = e.getWikiBacklinks(b, filter = "nonredirects")
		n1 = log(max(len(sa), len(sb)))
		n2 = log(len(set.intersection(sa, sb)))
		d1 = log(10 ** 7)
		d2 = log(min(len(sa), len(sb)))
		extra1 = extra2 = 0
		#if a in sb: extra1 = log(10 ** 7 / len(sb))
		#if b in sa: extra2 = log(10 ** 7 / len(sa))
		try:
			return (n1 - n2) / float(d1 - d2)
		except ZeroDivisionError as e:
			print e
			return self.INF
Example #25
class Extraktor(object):
    def __init__(self):
        self.extractor = Extractor()
        self.sqs = boto3.client('sqs')
        self.queue_url = 'https://sqs.ap-southeast-1.amazonaws.com/841662669278/crawler'
        self.s3 = boto3.client('s3')
        self.dynamodb = boto3.resource('dynamodb')
        self.bloom_filter = MyBloomFilter(self.dynamodb.Table('link'))

    def process(self):
        while True:
            ret = self.sqs.receive_message(
                QueueUrl=self.queue_url,
                MaxNumberOfMessages=10,
                WaitTimeSeconds=1
            )

            if 'Messages' not in ret:
                continue
            
            for msg in ret['Messages']:
                key = msg['Body']
                record = self.s3.get_object(Bucket='samuel-html', Key=key)
                #pack['Body'] botocore.response.StreamingBody
                pack = json.loads(lzo.decompress(record['Body'].read()).decode('utf-8'))
            #    response = self.client.delete_message(
            #        QueueUrl=self.queue_url,
            #        ReceiptHandle=msg['ReceiptHandle']
            #    )
            #    print(response)

                self.bloom_filter.add(pack['url'])
                if pack.get('code') == 200:
                    url = pack['url']
                    ret = self.extractor.extract(pack)
                    for link in ret['links']:
                        if not self.bloom_filter.add(link['url']):
                            seed(link)
                        else:
                            #print 'already crawled', link['url']
                            pass
                    #save pack to tbl_link
                    self.dynamodb.Table('link').put_item(
                        Item = {
                            'url': url,
                            'ctime': Decimal(str(time.time())),
                            'utime': Decimal(str(time.time()))
                        }
                    )
                    logger.info("%s ok" % (pack['url']))
                else:
                    logger.warn("%s not ok code:%d" % (pack['url'], pack.get('code')))
                response = self.sqs.delete_message(
                    QueueUrl=self.queue_url,
                    ReceiptHandle=msg['ReceiptHandle']
                )
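A hypothetical entry point for the worker above; it assumes AWS credentials are configured and that the 'crawler' SQS queue, the 'samuel-html' S3 bucket and the 'link' DynamoDB table referenced in the class actually exist.

if __name__ == '__main__':
    # Poll SQS forever, extracting links from each fetched page.
    Extraktor().process()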
Example #26
	def extract_comments(self):
		if self.has_soup():
			comments = self.soup.find_all("div", class_="comment") or []
			for comment in comments:
				extractor = Extractor(comment)
				
				author = extractor.extract_comment_author_user_name()
				post_url = "" #this needs to be set with the post in scope
				date = extractor.extract_comment_date()
				score = extractor.extract_comment_score()
				body = extractor.extract_comment_body()

				self.comments.append(Comment(
					author=author,
					post_url=post_url,
					date=date,
					score=score,
					body=body
				))

		return self.comments
Example #27
 def __init__(self, song):
     self.song = song
     self.model = SongModel()
     self.extractor = Extractor()
     self.tempo = 0
     self.rolloffmoy = 0.0
     self.rolloffect = 0.0
     self.zcrmoy = 0.0
     self.zcrect = 0.0
     self.duration = 0.0
     self.genre = []
     for l in open("training/Tracks/genres.txt"):
         self.genre.append(l.replace('\n',''))
Example #28
def ext_result():
    rdfUrl = ''
    if request.method == 'POST':
        rdfUrl = request.form['url']
        try:
            r = requests.get(rdfUrl)
            #rdfUrl = str(r.status_code)
            g = Graph()
            #g.parse("MAD.rdf", format="xml")
            g.load(rdfUrl)
            ext = Extractor(g)
            uris = ext.getUris()
            terms = ext.terms()
            result = {}
            result['uris'] = uris
            result['terms'] = terms
            result['bNodes'] =str(len(ext.getBnodes()))
            result['uNodes'] = str(len(uris))
            return render_template('index.html', result= result)
        except requests.exceptions.ConnectionError:
            X2Rwarning = 'X2R Warning: The requested URL raises ConnectionError~!!!'
            return X2Rwarning
Example #29
    def setUp(self):
        # open and parse model xml file
        self.fmodel = open("logs_new/model_log.xml")
        self.model_tree = ET.parse(self.fmodel)
        self.model_root = self.model_tree.getroot()
        # create sample experiment data
        example = "LOQ75625Team LOQ            49% blend std 8mm       21-JUN-2000 12:55:23    30.0"
        self.extractor = Extractor()
        self.extractor.extract_data(example)
        self.maxDiff = None

        self.xml_out = XMLOutputter("testlog")
        self.xml_out.write_line(self.extractor)
Example #30
def main():
    downloader = Downloader()
    extractor = Extractor()
    url = "https://pornhub.com"

    puts(colored.green("getting video keys."))
    main_page = downloader.get(url)
    view_keys = extractor.get_viewkeys(main_page)

    puts(colored.green("starting to download videos."))
    for key in view_keys:
        puts(colored.green("getting video information."))
        absolute_url = "https://pornhub.com/view_video.php?viewkey=" + key
        page = downloader.get(absolute_url)
        info = extractor.get_video_info(page)

        if info is None:
            continue

        hd_quality = info['mediaDefinitions'][0]
        puts(colored.green("downloading video %s." % info['video_title']))
        downloader.save_file(hd_quality["videoUrl"], info['video_title'] + ".mp4")
Example #31
from extractor import Extractor
import LeaveMessage
import re
from getTime import *
from getType import *
from doAskForLeave import *
from stanfordcorenlp import StanfordCoreNLP
from getReason import get_reason
ex = Extractor()


# def get_type(sentence):
#     affairs = re.search(r'(.*)事(.*)假(.*).*', sentence, re.M | re.I)
#     sick = re.search(r'(.*)病(.*)假(.*).*', sentence, re.M | re.I)
#     marriage = re.search(r'(.*)婚(.*)假(.*).*', sentence, re.M | re.I)
#     if affairs:
#         return "事假"
#     elif sick:
#         return "病假"
#     elif marriage:
#         return "婚假"
#     else:
#         return None


def ask(message):
    # Nothing filled in yet -> "请输入请假时间等信息" ("please enter the leave time and other details")
    if (message.startDate is None and message.endDate is None
            and message.duration is None and message.type is None
            and message.examinePerson is None and message.email is None
            and message.reason is None):
        return "请输入请假时间等信息"

    if message.type is None:
        return "请输入请假类型"  # "please enter the leave type"
Example #32
sys.exit(0)
yto_config = yto(sys.argv[1])

# Set defaults.
seq_length = yto_config.videoSeqLength
class_limit = None  # Number of classes to extract. Can be 1-101 or None for all.

# Get the dataset.
data = DataSet(seq_length=seq_length,
               class_limit=class_limit,
               repo_dir=yto_config.repoDir,
               feature_file_path=yto_config.featureFileName,
               work_dir=yto_config.workDir)

# get the model.
model = Extractor()

# Loop through data.
pbar = tqdm(total=len(data.data))
sequence_path = os.path.join(yto_config.workDir, 'sequences')
if not os.path.exists(sequence_path):
    print("Creating sequence folder [%s]" % sequence_path)
    os.makedirs(sequence_path)
for video in data.data:

    # Get the path to the sequence for this video.
    path = os.path.join(sequence_path, video[2] + '-' + str(seq_length) + \
        '-features')  # numpy will auto-append .npy

    # Check if we already have it.
    if os.path.isfile(path + '.npy'):
Example #33
 def __init__(self):
     self.preprocessor = Preprocessor()
     self.extractor = Extractor()
     self.normalizer = Normalizer()
Example #34
def Extrair():
    # Prompt (Portuguese): "Enter the processing time (in seconds): "
    print('Informe o Tempo de Processamento (em segundos): ', end='')
    tempo = float(input())
    extracao = Extractor(
        tempo
    )  # Connects to the Twitter API and extracts data into a local database.
Example #35
class InteractivePredictor:
    exit_keywords = ['exit', 'quit', 'q']

    def __init__(self, config, model):
        model.predict([])
        self.model = model
        self.config = config
        self.path_extractor = Extractor(config,
                                        EXTRACTION_API,
                                        self.config.MAX_PATH_LENGTH,
                                        max_path_width=2)

    @staticmethod
    def read_file(input_filename):
        with open(input_filename, 'r') as file:
            return file.readlines()

    def predict(self):
        input_filename = 'Input.java'
        print('Serving')
        while True:
            print('Modify the file: "' + input_filename +
                  '" and press any key when ready, or "q" / "exit" to exit')
            user_input = input()
            if user_input.lower() in self.exit_keywords:
                print('Exiting...')
                return
            user_input = ' '.join(self.read_file(input_filename))
            try:
                predict_lines, pc_info_dict = self.path_extractor.extract_paths(
                    user_input)
            except ValueError:
                continue
            model_results = self.model.predict(predict_lines)

            prediction_results = Common.parse_results(model_results,
                                                      pc_info_dict,
                                                      topk=SHOW_TOP_CONTEXTS)
            for index, method_prediction in prediction_results.items():
                print('Original name:\t' + method_prediction.original_name)
                if self.config.BEAM_WIDTH == 0:
                    print('Predicted:\t%s' % [
                        step.prediction
                        for step in method_prediction.predictions
                    ])
                    for timestep, single_timestep_prediction in enumerate(
                            method_prediction.predictions):
                        print('Attention:')
                        print(
                            'TIMESTEP: %d\t: %s' %
                            (timestep, single_timestep_prediction.prediction))
                        for attention_obj in single_timestep_prediction.attention_paths:
                            print('%f\tcontext: %s,%s,%s' %
                                  (attention_obj['score'],
                                   attention_obj['token1'],
                                   attention_obj['path'],
                                   attention_obj['token2']))
                else:
                    print('Predicted:')
                    for predicted_seq in method_prediction.predictions:
                        print('\t%s' % predicted_seq.prediction)
Example #36
 def __init__(self):
     self.__extractor = Extractor()
     self.__tokeniser = Tokeniser()
     self.__tagger = Tagger()
     self.__dataset = Dataset()
     self.__logger = Logger()
Example #37
 def __init__(self):
     self.Users = Users()
     self.extractor = Extractor()
     self.list_User = self.extractor.extractorUsers
Example #38
class InteractivePredictor:
    exit_keywords = ['exit', 'quit', 'q']

    def __init__(self, config, model):
        model.predict([])
        self.model = model
        self.config = config
        self.path_extractor = Extractor(config,
                                        jar_path=JAR_PATH,
                                        max_path_length=MAX_PATH_LENGTH,
                                        max_path_width=MAX_PATH_WIDTH)

    def read_file(self, input_filename):
        with open(input_filename, 'r') as file:
            return file.readlines()

    def predict(self):
        input_filename = 'Input.java'
        print('Starting interactive prediction...')
        while True:
            print(
                'Modify the file: "%s" and press any key when ready, or "q" / "quit" / "exit" to exit'
                % input_filename)
            user_input = input()
            if user_input.lower() in self.exit_keywords:
                print('Exiting...')
                return
            try:
                predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(
                    input_filename)
            except ValueError as e:
                print(e)
                continue
            results, code_vectors = self.model.predict(predict_lines)
            prediction_results = common.parse_results(results,
                                                      hash_to_string_dict,
                                                      topk=SHOW_TOP_CONTEXTS)
            for i, method_prediction in enumerate(prediction_results):
                print('Original name:\t' + method_prediction.original_name)
                for name_prob_pair in method_prediction.predictions:
                    print('\t(%f) predicted: %s' %
                          (name_prob_pair['probability'],
                           name_prob_pair['name']))
                print('Attention:')
                for attention_obj in method_prediction.attention_paths:
                    print('%f\tcontext: %s,%s,%s' %
                          (attention_obj['score'], attention_obj['token1'],
                           attention_obj['path'], attention_obj['token2']))
                if self.config.EXPORT_CODE_VECTORS:
                    print('Code vector:')
                    print(' '.join(map(str, code_vectors[i])))

    def dn_predict(self):
        # input_filename = 'Input.java'
        # input_filename = input()
        print('Starting interactive prediction...')
        data_list = glob.glob("data/in_use/*/*.java")
        for input_filename in data_list:
            # while True:
            # print(
            # 'Modify the file: "%s" and press any key when ready, or "q" / "quit" / "exit" to exit' % input_filename)
            # user_input = input()
            # input_filename = input()
            # if user_input.lower() in self.exit_keywords:
            print(input_filename)
            if input_filename.lower() in self.exit_keywords:
                print('Exiting...')
                return
            try:
                predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(
                    input_filename)
            except ValueError as e:
                print(e)
                continue
            results, code_vectors = self.model.predict(predict_lines)
            prediction_results = common.parse_results(results,
                                                      hash_to_string_dict,
                                                      topk=SHOW_TOP_CONTEXTS)
            for i, method_prediction in enumerate(prediction_results):
                print('Original name:\t' + method_prediction.original_name)
                for name_prob_pair in method_prediction.predictions:
                    print('\t(%f) predicted: %s' %
                          (name_prob_pair['probability'],
                           name_prob_pair['name']))
                print('Attention:')
                for attention_obj in method_prediction.attention_paths:
                    print('%f\tcontext: %s,%s,%s' %
                          (attention_obj['score'], attention_obj['token1'],
                           attention_obj['path'], attention_obj['token2']))
                if self.config.EXPORT_CODE_VECTORS:
                    print('Code vector:')
                    print(' '.join(map(str, code_vectors[i])))
                    with open('jms_output.txt', 'a') as f_out:
                        f_out.write("{}\t{}\n".format(
                            input_filename,
                            ', '.join(map(str, code_vectors[i]))))
Example #39
import sys
import cv2
import numpy as np
from data import DataSet
from extractor import Extractor
from keras.models import load_model
from insert1 import insert_img
from Connector_mysql import connect
import winsound
import re
import math
import io
print("Loading Model .......")
saved_LSTM_model = load_model(
    "data\\checkpoints\\lstm-features.022-0.035.hdf5", compile=False)
extract_model = Extractor(image_shape=(320, 240, 3))
print(
    "****************************Model Ready.......***************************"
)


def video(video_file):
    #print('time take to load imports {:0.3f}'.format(time.time() - start))
    start = time.time()
    '''print(sys.argv)
    if (len(sys.argv) == 2):
        #seq_length = int(sys.argv[1])
        #class_limit = int(sys.argv[2])
        #saved_model = sys.argv[3]
        #video_file = sys.argv[1]
    else:
Example #40
        Formats the given file.
        """

        if not file_util.is_missing_or_empty_file(raw_output_path):
            xml = etree.parse(raw_output_path, etree.XMLParser(recover=True))

            sections = []

            # Extract the title.
            title_nodes = xml.xpath(title_xpath)
            sections.append("".join(
                [x.text.replace("\n", " ").strip() for x in title_nodes]))

            # Extract the lines.
            section_nodes = xml.xpath(sections_xpath)
            for node in section_nodes:
                line_nodes = node.xpath(line_xpath)
                sections.append("\n".join([
                    x.text.replace("\n", " ").strip() for x in line_nodes
                    if x is not None and x.text is not None
                ]))
            return "\n\n".join(sections)
        return ""


if __name__ == "__main__":
    arg_parser = Extractor.get_argument_parser()
    args = arg_parser.parse_args()

    PdfExtractExtractor(args).process()
Example #41
 def __init__(self, config_fp, language="es"):
     Extractor.__init__(self, config_fp)
     self.language = language
Example #42
 def __init__(self, config_fp, language="en"):
     Extractor.__init__(self, config_fp)
     # should be modified
     self.language = language
Example #43
    def __init__(self):
        super().__init__()

        self.particles = "e"
        self.req_particles = None

        self.pu_mode = None
        self.req_pu_mode = None
        self.req_kickers_mode = False
        self.state = 'idle'
        self.ic_runmode = 'idle'

        self.linStarter = LinStarter()
        self.extractor = Extractor()
        self.modeCtl = ModesClient()
        self.pu_ctl = PUSwitcher()

        self.mode_subsys = [37, 38, 39]

        self.modeCtl.markedReady.connect(self.kickers_loaded)
        self.linStarter.runDone.connect(self.next_state)
        self.extractor.extractionDone.connect(self.next_state)
        self.pu_ctl.switching_done.connect(self.next_state)

        self.timer = cda.Timer()
        self.calibr_timer = cda.Timer()

        self.states = [
            self.__idle, self.__preinject, self.__inject2, self.__injected,
            self.__preextract, self.__extract2, self.__extracted,
            self.__pu_switching, self.__pu_switched
        ]

        # output channels
        self.c_state = cda.StrChan('cxhw:0.ddm.state',
                                   on_update=True,
                                   max_nelems=20)
        self.c_stateMsg = cda.StrChan('cxhw:0.ddm.stateMsg',
                                      on_update=True,
                                      max_nelems=100)

        self.c_icrunmode = cda.StrChan('cxhw:0.ddm.ICRunMode',
                                       on_update=True,
                                       max_nelems=20)

        # command channels
        self.cmds = [
            'stop', 'inject', 'extract', 'nround', 'autorun', 'e2v4', 'p2v4',
            'e2v2', 'p2v2'
        ]
        self.c_cmds = [
            cda.IChan('cxhw:0.ddm.' + x, on_update=True) for x in self.cmds
        ]
        for c in self.c_cmds:
            c.valueMeasured.connect(self.cmd_proc)

        # option-command channels
        self.c_particles = cda.StrChan('cxhw:0.ddm.particles',
                                       on_update=True,
                                       max_nelems=20)
        self.c_particles.valueMeasured.connect(self.particles_update)
        self.c_particles.setValue(self.particles)

        self.c_extr_train = cda.IChan('cxhw:0.ddm.extr_train', on_update=True)
        self.c_extr_train.valueMeasured.connect(self.train_proc)

        self.c_extr_train_interval = cda.DChan(
            'cxhw:0.ddm.extr_train_interval', on_update=True)
        self.c_extr_train_interval.valueMeasured.connect(
            self.train_interval_update)

        # event channels
        self.c_injected = cda.IChan('cxhw:0.ddm.injected', on_update=True)
        self.c_extracted = cda.IChan('cxhw:0.ddm.extracted', on_update=True)

        # beam current channels
        self.c_beamcur = cda.DChan('cxhw:0.dcct.beamcurrent', on_update=True)
        self.c_extr_beamCur = cda.DChan('cxhw:0.dcct.ExtractionCurrent',
                                        on_update=True)

        self.c_v2k_auto = cda.IChan('cxhw:0.ddm.v2k_auto', on_update=True)
        self.c_v2k_particles = cda.StrChan('cxhw:0.bep.particles',
                                           on_update=True,
                                           max_nelems=20)
        self.c_v2k_particles.valueMeasured.connect(self.v2k_auto_mode)
        self.c_v2k_offline = cda.IChan('cxhw:0.bep.offline', on_update=True)
        self.c_v2k_offline.valueMeasured.connect(self.v2k_offline_proc)

        self.linbeam_cor = LinBeamCtl()
Example #44
class Annotator():
    __job_position_tag = "EMP-POS"
    __job_company_tag = "EMP-COMP"

    __education_course_tag = "EDU-MAJOR"
    __education_institution_tag = "EDU-INST"

    def __init__(self):
        self.__extractor = Extractor()
        self.__tokeniser = Tokeniser()
        self.__tagger = Tagger()
        self.__dataset = Dataset()
        self.__logger = Logger()

    def prepare_dataset(self, nr_of_docs=-1):
        resumes, labels = self.__extractor.read_raw_files(nr_of_docs)

        resumes = self.__tokeniser.tokenise_docs_to_lines(resumes)
        resumes = self.__tokeniser.tokenise_doclines_to_words(resumes)

        self.__dataset.resume_content = self.annotate_docs(resumes, labels)
        self.__dataset.save()

    # resumes: list of tokenised (by line and word) résumé docs
    # labels: xml structure storing labels for several resumes
    def annotate_docs(self, resumes, labels):
        self.__logger.println("annotating resumes")
        annotated_resumes = []
        for idx, resume in enumerate(resumes):
            annotated_resumes.append(self.annotate_doc(resume, labels[idx]))
            self.__logger.println(
                "annotating resume %s/%s with true labels and pos tags" %
                (idx + 1, len(resumes)))

        # non local ner tag entire dataset at a time for speed
        annotated_resumes = self.__tagger.nonlocal_ner_tag(annotated_resumes)
        self.__logger.println("completed annotating resumes")
        return annotated_resumes

    # doc: a single résumé document with token strings in each slot of list
    # labels: xml structure storing pre-extracted information
    def annotate_doc(self, doc, labels):
        job_title_list = self.__extractor.get_job_titles(labels)
        job_company_list = self.__extractor.get_company_names(labels)
        edu_major_list = self.__extractor.get_edu_majors(labels)
        edu_inst_list = self.__extractor.get_edu_institutions(labels)
        # can extract more labels here

        prepared_doc = self.__tagger.prepare_doc(doc)
        prepared_doc = self.__match_entity(prepared_doc, job_title_list,
                                           self.__job_position_tag)
        prepared_doc = self.__match_entity(prepared_doc, job_company_list,
                                           self.__job_company_tag)
        prepared_doc = self.__match_entity(prepared_doc, edu_major_list,
                                           self.__education_course_tag)
        prepared_doc = self.__match_entity(prepared_doc, edu_inst_list,
                                           self.__education_institution_tag)
        prepared_doc = self.__tagger.add_default_entity_tags(prepared_doc)

        prepared_doc = self.__tagger.pos_tag(prepared_doc)

        return prepared_doc

    # doc: résumé doc to be annotated
    # entity_list: list of labels to matched in doc
    # tag: tag to be assigned if match found
    def __match_entity(self, doc, entity_list, tag):
        for entity in entity_list:
            doc = self.__tagger.match_label(doc, entity, tag)
        return doc

    # function takes in a path to file and annotates it for tagging
    # to be ideally used to tag as a one off for testing
    # filepath: path to résumé
    def annotate_using_trained_model(self, filepath):
        resume_content = self.__extractor.read_resume_content(filepath)

        resume_content = self.__tokeniser.tokenise_docs_to_lines(
            resume_content)
        resume_content = self.__tokeniser.tokenise_doclines_to_words(
            resume_content)

        prepared_doc = self.__tagger.prepare_doc(resume_content[0])
        prepared_doc = self.__tagger.pos_tag(prepared_doc)
        prepared_doc = self.__tagger.nonlocal_ner_tag([prepared_doc])

        return prepared_doc[0]
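A hypothetical usage sketch for the Annotator above; it assumes the raw résumé files and label XML that Extractor.read_raw_files() expects are already on disk, and the résumé path is a placeholder.

annotator = Annotator()
# Annotate the first 50 résumés with true labels plus POS/NER tags and save the dataset.
annotator.prepare_dataset(nr_of_docs=50)
# Tag a single résumé without true labels, e.g. for inference with a trained model.
prepared = annotator.annotate_using_trained_model("resumes/sample_resume.pdf")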
Example #45
class InjExtLoop:
    def __init__(self):
        super().__init__()

        self.particles = "e"
        self.req_particles = None

        self.pu_mode = None
        self.req_pu_mode = None
        self.req_kickers_mode = False
        self.state = 'idle'
        self.ic_runmode = 'idle'

        self.linStarter = LinStarter()
        self.extractor = Extractor()
        self.modeCtl = ModesClient()
        self.pu_ctl = PUSwitcher()

        self.mode_subsys = [37, 38, 39]

        self.modeCtl.markedReady.connect(self.kickers_loaded)
        self.linStarter.runDone.connect(self.next_state)
        self.extractor.extractionDone.connect(self.next_state)
        self.pu_ctl.switching_done.connect(self.next_state)

        self.timer = cda.Timer()
        self.calibr_timer = cda.Timer()

        self.states = [
            self.__idle, self.__preinject, self.__inject2, self.__injected,
            self.__preextract, self.__extract2, self.__extracted,
            self.__pu_switching, self.__pu_switched
        ]
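        # NOTE: self.states must stay index-aligned with the state_names list
        # (defined elsewhere); run_state() picks the handler by that index.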

        # output channels
        self.c_state = cda.StrChan('cxhw:0.ddm.state',
                                   on_update=True,
                                   max_nelems=20)
        self.c_stateMsg = cda.StrChan('cxhw:0.ddm.stateMsg',
                                      on_update=True,
                                      max_nelems=100)

        self.c_icrunmode = cda.StrChan('cxhw:0.ddm.ICRunMode',
                                       on_update=True,
                                       max_nelems=20)

        # command channels
        self.cmds = [
            'stop', 'inject', 'extract', 'nround', 'autorun', 'e2v4', 'p2v4',
            'e2v2', 'p2v2'
        ]
        self.c_cmds = [
            cda.IChan('cxhw:0.ddm.' + x, on_update=True) for x in self.cmds
        ]
        for c in self.c_cmds:
            c.valueMeasured.connect(self.cmd_proc)

        # option-command channels
        self.c_particles = cda.StrChan('cxhw:0.ddm.particles',
                                       on_update=True,
                                       max_nelems=20)
        self.c_particles.valueMeasured.connect(self.particles_update)
        self.c_particles.setValue(self.particles)

        self.c_extr_train = cda.IChan('cxhw:0.ddm.extr_train', on_update=True)
        self.c_extr_train.valueMeasured.connect(self.train_proc)

        self.c_extr_train_interval = cda.DChan(
            'cxhw:0.ddm.extr_train_interval', on_update=True)
        self.c_extr_train_interval.valueMeasured.connect(
            self.train_interval_update)

        # event channels
        self.c_injected = cda.IChan('cxhw:0.ddm.injected', on_update=True)
        self.c_extracted = cda.IChan('cxhw:0.ddm.extracted', on_update=True)

        # beam current channels
        self.c_beamcur = cda.DChan('cxhw:0.dcct.beamcurrent', on_update=True)
        self.c_extr_beamCur = cda.DChan('cxhw:0.dcct.ExtractionCurrent',
                                        on_update=True)

        self.c_v2k_auto = cda.IChan('cxhw:0.ddm.v2k_auto', on_update=True)
        self.c_v2k_particles = cda.StrChan('cxhw:0.bep.particles',
                                           on_update=True,
                                           max_nelems=20)
        self.c_v2k_particles.valueMeasured.connect(self.v2k_auto_mode)
        self.c_v2k_offline = cda.IChan('cxhw:0.bep.offline', on_update=True)
        self.c_v2k_offline.valueMeasured.connect(self.v2k_offline_proc)

        self.linbeam_cor = LinBeamCtl()

    def v2k_offline_proc(self, chan):
        if self.c_v2k_auto.val == 0 or self.pu_mode not in {'e2v2', 'p2v2'}:
            return
        if self.c_v2k_offline.val == 1:
            self.linbeam_cor.close_beam()
        elif self.c_v2k_offline.val == 0:
            self.linbeam_cor.open_beam()

    def v2k_auto_mode(self, chan):
        if self.c_v2k_auto.val == 0 or self.req_pu_mode is not None:
            return
        if chan.val == 'positrons' and self.pu_mode == 'e2v2':
            self.p2v2()
        if chan.val == 'electrons' and self.pu_mode == 'p2v2':
            self.e2v2()

    def train_interval_update(self, chan):
        if chan.val > 0:
            self.extractor.set_training_interval(chan.val)
        else:
            chan.setValue(self.extractor.training_interval)

    def train_proc(self, chan):
        if chan.val and self.ic_runmode == 'idle':
            self.extractor.start_training()

    def particles_update(self, chan):
        if self.req_pu_mode is not None:
            return
        if self.particles == chan.val or chan.val not in {'e', 'p'}:
            return
        if self.ic_runmode == 'idle':
            self.set_particles(chan.val)
        else:
            self.req_particles = chan.val

    def set_particles(self, p):
        if self.particles == p:
            return
        self.particles = p
        self.linStarter.set_particles(self.particles)
        if self.c_particles.val != p:
            self.c_particles.setValue(p)

    def set_pu_mode(self, mode):
        if self.pu_mode == mode:
            return
        self.req_pu_mode = mode
        if self.ic_runmode == 'idle':
            self.run_state('pu_switching')

    def kickers_loaded(self):
        if self.req_kickers_mode:
            self.timer.singleShot(80, self.next_state)
            self.req_kickers_mode = False

    def run_state(self, state=None):
        if state is not None:
            self.state = state
        self.c_state.setValue(self.state)
        if self.ic_runmode == 'idle':
            return
        s_ind = state_names.index(self.state)
        self.c_stateMsg.setValue(stateMsg[s_ind])
        self.states[s_ind]()

    def next_state(self):
        s_ind = state_names.index(self.state)
        ns_ind = s_ind + 1
        if ns_ind < len(state_names):
            self.state = state_names[ns_ind]
            self.run_state()

    def __idle(self):
        pass

    def __preinject(self):

        if self.req_particles is not None:
            self.set_particles(self.req_particles)
            self.req_particles = None
        if self.req_pu_mode is not None:
            self.run_state('pu_switching')
            return

        self.req_kickers_mode = True
        self.modeCtl.load_marked(self.particles + 'inj', self.mode_subsys,
                                 ['rw'])

    def __inject2(self):
        self.linStarter.start()

    def __injected(self):
        self.c_injected.setValue(1)
        if self.ic_runmode in {"single-cycle", "auto-cycle"}:
            self.next_state()

    def __preextract(self):
        self.req_kickers_mode = True
        self.modeCtl.load_marked(self.particles + 'ext', self.mode_subsys,
                                 ['rw'])

    def __extract2(self):
        self.c_extr_beamCur.setValue(self.c_beamcur.val)
        self.extractor.extract()

    def __extracted(self):
        self.c_extracted.setValue(1)
        if self.ic_runmode == "auto-cycle":
            self.state = "preinject"
            self.run_state()

    def __pu_switching(self):
        if self.req_pu_mode is None:
            print('mode not requested')
            return
        self.modeCtl.load_marked(self.req_pu_mode, [7])
        self.set_particles(self.req_pu_mode[0])
        self.pu_ctl.switch_mode(self.req_pu_mode)

    def __pu_switched(self):
        self.pu_mode = self.req_pu_mode
        self.req_pu_mode = None
        if self.ic_runmode == "auto-cycle":
            self.run_state("preinject")
        else:
            self.run_state('idle')

    def cmd_proc(self, chan):
        if chan.first_cycle:
            return
        sn = chan.short_name()
        getattr(self, sn)()

    def set_runmode(self, runmode):
        self.ic_runmode = runmode
        self.c_icrunmode.setValue(runmode)

    def stop(self):
        self.linStarter.stop()
        self.extractor.stop()
        self.set_runmode('idle')
        self.run_state('idle')

    def inject(self):
        self.set_runmode("single-action")
        self.run_state('preinject')

    def extract(self):
        # check if something injected
        self.set_runmode("single-action")
        self.run_state('preextract')

    def nround(self):
        self.set_runmode("single-cycle")
        self.run_state('preinject')

    def autorun(self):
        self.set_runmode("auto-cycle")
        self.run_state('preinject')

    def e2v4(self):
        self.set_pu_mode('e2v4')

    def p2v4(self):
        self.set_pu_mode('p2v4')

    def e2v2(self):
        self.set_pu_mode('e2v2')

    def p2v2(self):
        self.set_pu_mode('p2v2')
Example #46
 def __init__(self, config, model):
     model.predict([])
     self.model = model
     self.config = config
     self.path_extractor = Extractor(config, EXTRACTION_API, self.config.MAX_PATH_LENGTH, max_path_width=2)
Example #47
def main():
    parser = argparse.ArgumentParser("PyTorch Face Recognizer")
    parser.add_argument('cmd',
                        type=str,
                        choices=['train', 'test', 'extract'],
                        help='train, test or extract')
    parser.add_argument('--arch_type',
                        type=str,
                        default='resnet50_ft',
                        help='model type',
                        choices=[
                            'resnet50_ft', 'senet50_ft', 'resnet50_scratch',
                            'senet50_scratch'
                        ])
    parser.add_argument('--dataset_dir',
                        type=str,
                        default='/path/to/dataset_directory',
                        help='dataset directory')
    # parser.add_argument('--log_file', type=str, default='/path/to/log_file', help='log file')
    # parser.add_argument('--train_img_list_file', type=str, default='/path/to/train_image_list.txt',
    #                     help='text file containing image files used for training')
    # parser.add_argument('--test_img_list_file', type=str, default='/path/to/test_image_list.txt',
    #                     help='text file containing image files used for validation, test or feature extraction')
    # parser.add_argument('--meta_file', type=str, default='/path/to/identity_meta.csv', help='meta file')
    # parser.add_argument('--checkpoint_dir', type=str, default='/path/to/checkpoint_directory',
    #                     help='checkpoints directory')
    parser.add_argument('--feature_dir',
                        type=str,
                        default='/path/to/feature_directory',
                        help='directory where extracted features are saved')
    # parser.add_argument('-c', '--config', type=int, default=1, choices=configurations.keys(),
    #                     help='the number of settings and hyperparameters used in training')
    # parser.add_argument('--batch_size', type=int, default=32, help='batch size')
    # parser.add_argument('--resume', type=str, default='', help='checkpoint file')
    parser.add_argument('--weight_file',
                        type=str,
                        default='/path/to/weight_file.pkl',
                        help='weight file')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('-j',
                        '--workers',
                        default=4,
                        type=int,
                        metavar='N',
                        help='number of data loading workers (default: 4)')
    # parser.add_argument('--horizontal_flip', action='store_true',
    #                     help='horizontally flip images specified in test_img_list_file')
    args = parser.parse_args()
    print(args)

    if args.cmd == "extract":
        utils.create_dir(args.feature_dir)

    if args.cmd == 'train':
        utils.create_dir(args.checkpoint_dir)
        cfg = configurations[args.config]

    log_file = args.log_file
    # resume = args.resume

    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
    cuda = torch.cuda.is_available()
    if cuda:
        print("torch.backends.cudnn.version: {}".format(
            torch.backends.cudnn.version()))

    torch.manual_seed(1337)
    if cuda:
        torch.cuda.manual_seed(1337)

    # 0. id label map
    # meta_file = args.meta_file
    # id_label_dict = utils.get_id_label_map(meta_file)

    # 1. data loader
    root = args.dataset_dir
    # train_img_list_file = args.train_img_list_file
    # test_img_list_file = args.test_img_list_file

    kwargs = {'num_workers': args.workers, 'pin_memory': True} if cuda else {}

    # if args.cmd == 'train':
    #     dt = datasets.VGG_Faces2(root, train_img_list_file, id_label_dict, split='train')
    #     train_loader = torch.utils.data.DataLoader(dt, batch_size=args.batch_size, shuffle=True, **kwargs)

    dv = datasets.VGG_Faces2(root,
                             test_img_list_file,
                             id_label_dict,
                             split='valid',
                             horizontal_flip=args.horizontal_flip)
    val_loader = torch.utils.data.DataLoader(dv,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             **kwargs)

    # 2. model
    include_top = True if args.cmd != 'extract' else False
    if 'resnet' in args.arch_type:
        model = ResNet.resnet50(num_classes=N_IDENTITY,
                                include_top=include_top)
    else:
        model = SENet.senet50(num_classes=N_IDENTITY, include_top=include_top)
    # print(model)

    start_epoch = 0
    start_iteration = 0
    utils.load_state_dict(model, args.weight_file)
    # if resume:
    #     checkpoint = torch.load(resume)
    #     model.load_state_dict(checkpoint['model_state_dict'])
    #     start_epoch = checkpoint['epoch']
    #     start_iteration = checkpoint['iteration']
    #     assert checkpoint['arch'] == args.arch_type
    #     print("Resume from epoch: {}, iteration: {}".format(start_epoch, start_iteration))
    # else:
    #     utils.load_state_dict(model, args.weight_file)
    #     if args.cmd == 'train':
    #         model.fc.reset_parameters()

    if cuda:
        model = model.cuda()

    criterion = nn.CrossEntropyLoss()
    if cuda:
        criterion = criterion.cuda()

    extractor = Extractor(
        cuda=cuda,
        model=model,
        val_loader=val_loader,
        log_file=log_file,
        feature_dir=args.feature_dir,
        flatten_feature=True,
        print_freq=1,
    )
    extractor.extract()
Example #48
 def extract_data(cls, file):
     """Uses extraction method from Extractor class."""
     ext = Extractor()
     ext.set_file(file)
     return ext.get_component_dictionary()
Example #49
from gensim.models import KeyedVectors
from extractor import Extractor
import pandas as pd
import sys

if __name__ == '__main__':
    assert len(
        sys.argv
    ) == 5, "Need trained word2vec path / dataset path / product id (-1 to work with all)  / max ngrams per tfidf"
    w2v_path = sys.argv[1]
    w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)
    extractor = Extractor()
    extractor.word2vec = w2v
    df_path = sys.argv[2]
    df = pd.read_csv(df_path)
    products = list(set(df["PRODUCT"]))
    products.sort()
    product_id = int(sys.argv[3])

    def shor_product(product_id):
        max_ngram_per_tfidf = int(sys.argv[4])
        index = df["PRODUCT"] == product_id
        texts = list(df.loc[index, "TEXT"]) + \
                list(df.loc[index, "BENEFITS"]) + \
                list(df.loc[index, "DRAWBACKS"])
        texts = list(map(str, filter(bool, texts)))
        extracted = extractor.transform(texts, 1, 4, max_ngram_per_tfidf)
        print(product_id)
        print(extracted)

    if product_id == -1: