Code example #1
 def extract_metadata(self):
     if not hasattr(self, "_extract_metadata_cache"):
         if self.base_file:
             e = extractor.Extractor()
             self._extract_metadata_cache = e.extractFromFile(self.base_file.path.encode())
         elif self.url_field:
             e = extractor.Extractor()
             http = httplib2.Http()
             response, body = http.request(self.url_field)
             self._extract_metadata_cache = e.extractFromData(data=body, size=len(body))
         else:
             self._extract_metadata_cache = None
     return self._extract_metadata_cache
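The method above memoizes the libextractor result on the instance, so repeated calls pay the extraction (or HTTP fetch) cost only once. A minimal call-pattern sketch, where `doc` stands for any instance of the class above (illustrative, not taken from the original project):

    metadata = doc.extract_metadata()        # first call runs libextractor or fetches the URL
    metadata_again = doc.extract_metadata()  # second call is served from _extract_metadata_cache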
Code example #2
    def on_idle(self):
        while (not self.in_q.empty()):
            temp = self.in_q.get()
            if type(temp) is list:
                #draw contours
                contours = temp[0]
                color = temp[1]
                primary_label = temp[2]
                normals = temp[3]
                self.make_display_lists(contours, color / 255.0, primary_label,
                                        normals)
                self.draw()
            else:
                #add new extractor
                args = temp.split()
                location = (int(args[0]), int(args[1]), int(args[2]))

                primary_id = []
                secondary_ids = []
                split_str = re.split(":", args[3])
                primary_id = [int(split_str[0])]
                if split_str[1] != "":
                    secondary_ids = [
                        int(label) for label in re.split(',', split_str[1])
                    ]
                ids = [primary_id + secondary_ids]

                extr = extractor.Extractor(self.in_q, self.directory, ids,
                                           location, max_x, max_y)
                extracting_worker = threading.Thread(target=extr.run,
                                                     name="extr")
                extracting_worker.daemon = True
                extracting_worker.start()
Code example #3
 def process_code(self, code, reduce_mode_mini=False):
     E = extractor.Extractor([
         code,
     ], self)
     E.extract()
     if len(self.years) > 1:
         max_year = max(self.years)
     else:
         max_year = max(self.years) + 1
     if self.aggregate_visits and not code.startswith('C'):
         aggregate_visits.aggregate_events(
             code,
             E.get_inpatient_visits(code),
             self.identifier,
             self.result_dir,
             reduce_mode_mini=reduce_mode_mini)
     if self.aggregate_readmits:
         unlinked_nodes, nodes, edges = E.get_readmit_nodes_edges(code)
         aggregate_edges.readmits.aggregate_readmits(
             code, unlinked_nodes, nodes, edges, self.identifier,
             self.result_dir, max_year)
     if self.aggregate_revisits:
         subsets = E.get_revisit_nodes_edges(code)
         for k, v in subsets.iteritems():
             aggregate_edges.revisits.aggregate_revisits(
                 k, v['unlinked_nodes'], v['nodes'], v['edges'],
                 self.identifier, self.result_dir, max_year)
     if self.aggregate_patients:
         aggregate_patients.aggregate_patients(code, E.get_patients(code),
                                               self.identifier,
                                               self.result_dir,
                                               self.patients)
Code example #4
def read_metadata_local(inputfile, verbose):
    """
    Metadata extraction from many kinds of files

    @param inputfile: path to the image
    @type inputfile: string
    @param verbose: verbosity
    @type verbose: int
    @rtype: dict
    @return: dictionary with metadata
    """
    # Initialization dict
    meta_info = {}

    # Extraction
    xtract = extractor.Extractor()

    # Get the keywords
    keys = xtract.extract(inputfile)

    # Loop to dump data to the dict
    for keyword_type, keyword in keys:
        meta_info[keyword_type.encode('iso-8859-1')] = \
            keyword.encode('iso-8859-1')

    # Return the dictionary
    return meta_info
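A minimal usage sketch for the helper above, assuming the libextractor Python binding is installed; the filename and verbosity value are illustrative:

    meta = read_metadata_local('photo.jpg', verbose=0)
    for keyword_type, keyword in meta.items():
        print('%s: %s' % (keyword_type, keyword))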
Code example #5
File: script.py Project: LinguList/Cognates
def groupDeduction(measure):
    # Feature extraction
    ext = extractor.Extractor()

    if measure == constants.IDENTICAL_WORDS:
        predictedLabels, predictedSets = ext.identicalWordsGroupBaseline(
            prr.testMeanings, prr.testLanguages, rdr.wordforms)
    elif measure == constants.IDENTICAL_PREFIX:
        predictedLabels, predictedSets = ext.identicalPrefixGroupBaseline(
            prr.testMeanings, prr.testLanguages, rdr.wordforms)
    elif measure == constants.IDENTICAL_LETTER:
        predictedLabels, predictedSets = ext.identicalFirstLetterGroupBaseline(
            prr.testMeanings, prr.testLanguages, rdr.wordforms)

    trueLabels = ext.extractGroupLabels(rdr.cognateSets, rdr.wordforms,
                                        prr.testMeanings, prr.testLanguages)

    # Evaluation
    lrn = learner.Learner()
    V1scores = {
        meaningIndex: lrn.computeV1(trueLabels[meaningIndex],
                                    predictedLabels[meaningIndex])
        for meaningIndex in prr.testMeanings
    }

    # Reporting
    output.reportGroup(constants.DEDUCERS[measure], V1scores, rdr.meanings)
    output.saveGroup("output/Group " + constants.DEDUCERS[measure] + ".txt",
                     predictedSets)
Code example #6
File: script.py Project: LinguList/Cognates
def pairwiseDeduction(measure):
    # Feature extraction
    ext = extractor.Extractor()

    if measure == constants.IDENTICAL_WORDS:
        ext.identicalWordsBaseline(prr.examples, prr.labels)
    elif measure == constants.IDENTICAL_PREFIX:
        ext.identicalPrefixBaseline(prr.examples, prr.labels)
    elif measure == constants.IDENTICAL_LETTER:
        ext.identicalFirstLetterBaseline(prr.examples, prr.labels)

    predictions = ext.testExamples.reshape((ext.testExamples.shape[0], ))

    # Evaluation
    lrn = learner.Learner()
    accuracy = lrn.computeAccuracy(ext.testLabels, predictions)
    F1 = lrn.computeF1(ext.testLabels, predictions)
    report = lrn.evaluatePairwise(ext.testLabels, predictions)

    # Reporting
    output.reportPairwiseDeduction(constants.DEDUCERS[measure], prr, accuracy,
                                   F1, report)
    output.savePredictions(
        "output/Pairwise " + constants.DEDUCERS[measure] + ".txt",
        prr.examples[constants.TEST], ext.testExamples, predictions,
        ext.testLabels)

    return predictions
Code example #7
File: downloader.py Project: sharkone/nzb2http
    def run(self):
        sys.stdout.write('[nzb2http][downloader] Started\n')
        self.stop_requested = False

        self.extractor = extractor.Extractor(self.get_first_rar_path())
        self.extractor.start()

        for incomplete_file in self.incomplete_files:
            map_result_async = self.pool.map_async(_run_worker,
                                                   incomplete_file.segments)
            while not self.stop_requested:
                try:
                    map_result = map_result_async.get(1)
                    self._write_nzb_file(incomplete_file, map_result)
                    sys.stdout.write(
                        '[nzb2http][downloader] Downloaded {0}\n'.format(
                            incomplete_file.path))
                    break
                except multiprocessing.TimeoutError:
                    pass

        self.pool.terminate()
        self.pool.join()

        sys.stdout.write('[nzb2http][downloader] Stopped\n')
Code example #8
 def __init__(self, fld, pool):
     self.creation_date = datetime.now()
     self.pool = pool
     self.fld = fld
     self.rowid = None
     self.thread_id = uuid.uuid4().hex
     self.extractor = extractor.Extractor(self.fld, None, self.pool)
     self.crawl_counter = 0
Code example #9
def extract(input_file):
    sys.path.append('./scripts')
    import extractor
    e = extractor.Extractor(input_file, 'images', True, False, False, '127.0.0.1', None)
    ocwd = os.getcwd()
    (iid, repeated) = e.extract()
    os.chdir(ocwd)
    return (iid, repeated)
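For illustration, the wrapper above could be called like this (the input path is an assumption; all other Extractor arguments are hard-coded inside the function):

    iid, repeated = extract('sample.pdf')
    print('identifier: %s, repeated: %s' % (iid, repeated))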
Code example #10
File: script.py Project: LinguList/Cognates
def HK2011Pairwise(twoStage=False):
    # 1st Pass
    # Feature extraction
    ext = extractor.Extractor()
    ext.HK2011Baseline(prr.examples, prr.labels)

    # Learning
    lrn = learner.Learner()
    lrn.initSVM(0.1)
    lrn.fitSVM(ext.trainExamples, ext.trainLabels)

    # Prediction
    predictions1 = lrn.predictSVM(ext.testExamples)

    # Evaluation
    accuracy = lrn.computeAccuracy(ext.testLabels, predictions1)
    F1 = lrn.computeF1(ext.testLabels, predictions1)
    report = lrn.evaluatePairwise(ext.testLabels, predictions1)

    # Reporting
    stage = "HK2011 1st Pass"
    output.reportPairwiseLearning(stage, prr, accuracy, F1, report)
    output.savePredictions("output/" + stage + ".txt",
                           prr.examples[constants.TEST], ext.testExamples,
                           predictions1, ext.testLabels)

    # 2nd Pass
    if twoStage:
        # Feature extraction
        ext.appendBinaryLanguageFeatures(prr.examples, prr.labels,
                                         constants.TEST, prr.testLanguages)

        # Learning
        lrn = learner.Learner()
        lrn.initSVM(0.0001)
        lrn.fitSVM(ext.testExamples, predictions1)

        # Prediction
        predictions2 = lrn.predictSVM(ext.testExamples)

        # Evaluation
        accuracy = lrn.computeAccuracy(ext.testLabels, predictions2)
        F1 = lrn.computeF1(ext.testLabels, predictions2)
        report = lrn.evaluatePairwise(ext.testLabels, predictions2)

        # Reporting
        stage = "HK2011 2nd Pass"
        output.reportPairwiseLearning(stage, prr, accuracy, F1, report)
        output.savePredictions("output/" + stage + ".txt",
                               prr.examples[constants.TEST], ext.testExamples,
                               predictions2, ext.testLabels)

        # Significance
        print constants.SIGNIFICANCE.format(
            lrn.computeMcNemarSignificance(ext.testLabels, predictions1,
                                           predictions2))

    return ext, lrn
Code example #11
File: main.py Project: valery-barysok/faceGenerator
def main():
    e = extractor.Extractor()
    # e.loadFilesToExtract() # extracts layers into images
    # e.generate(20)
    c = cutout.Cutout(
        "E:\\Projects\\2019\\The Game\\steam\\generator\\gmic\\gmic.exe",
        "E:\\Projects\\2019\\The Game\\steam\\generator\\skript\\output\\product\\myCutout.gmic",
        "E:\\Projects\\output", "E:\\Projects\\output\\out")
    c.runCutout()
Code example #12
def read_file(request):
    global data, urls, ext
    ext = extractor.Extractor('fld.com', None, p)
    p.database.setup_database()
    p.disable_processing = True
    urls = {'https://fld.com/arg',
            'http://fld.com/monster/depth',
            'https://fld.com/robot/wut'}
    with open('tests/utility/extractor_test_website.html') as f:
        data = f.read()
Code example #13
File: import_data.py Project: jahow/nodejs
def import_topo_to_postgis(path):
    '''Import data from the BD TOPO IGN
    into the postgis database'''
    bd_topo_extractor = extractor.Extractor(path)
    bd_topo_extractor.get_files_by_format('shp')
    bd_topo_extractor.bulk_create()
    for shapefile in bd_topo_extractor.tables:
        bd_topo_extractor.insert_data_from_shapefile(shapefile, **bd_topo_extractor.tables[shapefile])

    #bd_topo_extractor.commit_to_database()
    import_to_geoserver(bd_topo_extractor)
Code example #14
File: script.py Project: LinguList/Cognates
def treeFeatureSelection():
    # Feature extraction
    ext = extractor.Extractor()
    ext.appendWordSimilarityFeatures(prr.examples, prr.labels, ext.allMeasures)

    # Feature selection
    lrn = learner.Learner()
    lrn.initForest(250, 0)
    lrn.fitForest(ext.trainExamples, ext.trainLabels)
    importances = lrn.getForestImportances()

    # Reporting
    for i, feature in enumerate(ext.allMeasures):
        print "{0}: {1:.4f}".format(feature, importances[i])
Code example #15
File: viewer_3d.py Project: rajtrivedi2001/rhoana
 def on_idle(self):
     timer = time.time()
     while (not self.in_q.empty() and time.time() - timer < .1):
         self.icon_color = (
             self.icon_color + .01
         ) % 1  #resets to black when icon is green since 1.0 and 0.0 %1 are equal
         temp = self.in_q.get()
         if temp[0] == "marker":
             self.pick_location = temp[1:][0]
             self.pick_location[0] = int(
                 float(self.pick_location[0] * self.columns) / self.max_x)
             self.pick_location[1] = int(
                 float(self.pick_location[1] * self.rows) / self.max_y)
         elif temp[0] == "ids":
             self.num_labels += 1
             label_idx = self.num_labels
             self.label_dict[label_idx] = temp[1:][0][0][0]
             extr = extractor.Extractor(self.in_q, self.directory,
                                        temp[1:][0], self.pick_location,
                                        self.max_x, self.max_y, label_idx)
             self.extractor_dict[temp[1][0][0]] = extr
             extracting_worker = threading.Thread(target=extr.run,
                                                  name="extr")
             extracting_worker.daemon = True
             extracting_worker.start()
         elif temp[0] == "contours":
             contours = temp[1]
             color = temp[2]
             primary_label = temp[3]
             normals = temp[4]
             label_idx = temp[5]
             if self.make_lists:
                 self.make_display_lists(contours, color / 255.0,
                                         primary_label, normals, label_idx)
         elif temp[0] == "limits":
             self.max_x = temp[1]
             self.max_y = temp[2]
             self.layers = temp[3]
         elif temp[0] == "refresh":
             self.refresh()
         elif temp[0] == "remove":
             self.remove_label(temp[1:][0])
         self.st = time.time()
         glutPostRedisplay()
     #set icon to green if processes are done
     if time.time() - self.st > 0.25:
         self.icon_color = np.array((0.0, 1.0, 0.0))
         self.make_lists = True
         glutPostRedisplay()
Code example #16
    def setUp(self):
        self.extract = extractor.Extractor()
        self.extract.ROOT_DIR = 'C:\\SIS'
        self.extract.EXCLUDE_DIRS = [r'.\SIS\ACBr', 
                                     r'.\SIS\SISMobile', 
                                     r'.\SIS\SISDLL'] 

        self.extract.EXCLUDE_FILES_WITH = ['900A.dfm',
                                           'BARVERTICAL',
                                           'Frame',
                                           'PAI']

        self.extract.VALID_EXTENSION = '.dfm'
        self.extract.CSV_FILE = 'result.csv'
        self.extract.formList = {}
        if os.path.isfile('result.csv'):
            os.remove('result.csv')
Code example #17
def extract_metadata(sender, instance, field_mapping, force=False):
    """ Extract and populate metadata from the file itself.
        @force: Overwrite existing metadata
    """
    import extractor as libextractor
    extractor = libextractor.Extractor(lang="en")

    if not extractor:
        return

    all_keywords = extractor.extract(data=instance.file.read(),
                                     size=instance.file.size)
    keywords = dict(all_keywords)

    for attr, field in field_mapping.items():
        if field in keywords and (force or not hasattr(instance, attr)):
            # 1. Extract data
            value = keywords[field].encode('iso-8859-1')

            # 2. Post-extraction processing
            try:
                value = getattr(instance, 'process_metadata_%s' % attr)(value)

            # No value processing defined, maybe try some basic automatic processing
            except AttributeError:
                # Date/time processing
                if isinstance(instance._meta.get_field(field),
                              (models.DateField, models.DateTimeField)):
                    for pattern in ('%Y-%m-%dT%H:%M:%SZ', '%Y%m%d%H%M%S'):
                        try:
                            # String is trimmed to the size of the pattern, assuming that
                            # it is the same length as the string it is matching (coincidentally, it often is!).
                            value = datetime.strptime(value[:len(pattern)],
                                                      pattern)
                        except ValueError:
                            continue

            # 3. Set the discovered value
            if value:
                setattr(instance, attr, value)

    # TODO: Other keywords might have multiple values, it would be better to handle that properly
    if hasattr(instance, 'plaintext'):
        for key, value in all_keywords:
            if key == 'unknown':
                instance.plaintext += ' ' + value
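A sketch of how the hook above might be wired up. The mapping keys (model attribute names), the mapped libextractor keyword names, and the receiver function are assumptions for illustration, not part of the snippet:

    # Illustrative mapping: model attribute -> libextractor keyword name.
    FIELD_MAPPING = {
        'title': 'title',
        'created': 'creation date',
    }

    def populate_metadata(sender, instance, **kwargs):
        # Fill empty fields only; pass force=True to overwrite existing values.
        extract_metadata(sender, instance, FIELD_MAPPING)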
Code example #18
    def get_hidden_states(self):
        # TODO: documentation
        """
        
        """

        ## reset rnn states
        self.model.reset_states()

        ## predict test set
        labels = self.model.predict(self.input, batch_size=BATCH_SIZE)

        ## extract states for all sequences across all chars.
        ex = extractor.Extractor(self.model, [0])
        states = ex.get_states(self.input, batch_size=BATCH_SIZE, \
                unshuffle=True)
        states_perchar = states.reshape(
            self.input.shape[0] * self.input.shape[1], -1)
        return states_perchar
Code example #19
 def process_code(self, code):
     E = extractor.Extractor([
         code,
     ], self)
     E.extract()
     if len(self.years) > 1:
         max_year = max(self.years)
     else:
         max_year = max(self.years) + 1
     # if self.aggregate_visits:
     #     aggregate_visits.aggregate_events(code, E.get_inpatient_visits(code), self.identifier, self.result_dir,reduce_mode_mini=False)
     # if self.aggregate_readmits:
     #     unlinked_nodes, nodes, edges = E.get_readmit_nodes_edges(code)
     #     aggregate_edges.readmits.aggregate_readmits(code, unlinked_nodes, nodes, edges, self.identifier, self.result_dir,max_year)
     # if self.aggregate_revisits:
     #     subsets = E.get_revisit_nodes_edges(code)
     #     for k, v in subsets.iteritems():
     #         aggregate_edges.revisits.aggregate_revisits(k, v['unlinked_nodes'], v['nodes'], v['edges'], self.identifier, self.result_dir, max_year)
     # if self.aggregate_patients:
     #     aggregate_patients.aggregate_patients(code,E.get_patients(code),self.identifier,self.result_dir,self.patients)
     raise NotImplementedError
Code example #20
File: script.py Project: LinguList/Cognates
def pairwiseLearning(minimal=False):
    # Feature extraction
    ext = extractor.Extractor()
    ext.consonantPrep = rdr.consonants
    ext.soundClassPrep = rdr.soundClasses

    if minimal:
        ext.appendWordSimilarityFeatures(prr.examples, prr.labels,
                                         ext.minimalMeasures)
        ext.appendPOSTags(prr.examples, prr.labels, rdr.POSTags)
    else:
        ext.appendWordSimilarityFeatures(prr.examples, prr.labels, [
            ext.commonBigramRatio, ext.commonTrigramNumber, ext.bigramDice,
            ext.jaroDistance
        ])
        ext.appendWordSimilarityFeatures(prr.examples, prr.labels,
                                         [ext.identicalWords], rdr.consonants)
        ext.appendWordSimilarityFeatures(
            prr.examples, prr.labels,
            [ext.LCPLength, ext.commonBigramNumber, ext.identicalPrefix],
            rdr.soundClasses)
        ext.appendPOSTags(prr.examples, prr.labels, rdr.POSTags)
        ext.appendLetterFeatures(prr.examples, prr.labels)
        ext.appendSameLanguageGroupFeatures(prr.examples, prr.labels)

    # Learning
    lrn, predictions = learn(ext, 0.0001)

    # Reporting
    stage = "Pairwise Learning"
    accuracy = lrn.computeAccuracy(ext.testLabels, predictions)
    F1 = lrn.computeF1(ext.testLabels, predictions)
    report = lrn.evaluatePairwise(ext.testLabels, predictions)

    output.reportPairwiseLearning(stage, prr, accuracy, F1, report)
    output.savePredictions("output/" + stage + ".txt",
                           prr.examples[constants.TEST], ext.testExamples,
                           predictions, ext.testLabels)

    return ext, lrn
Code example #21
    def on_idle(self):
        while (not self.in_q.empty()):
            self.icon_color = (
                self.icon_color + .01
            ) % 1  #resets to black when icon is green since 1.0 and 0.0 %1 are equal
            temp = self.in_q.get()
            if type(temp) is list:
                #draw contours
                contours = temp[0]
                color = temp[1]
                primary_label = temp[2]
                normals = temp[3]
                self.make_display_lists(contours, color / 255.0, primary_label,
                                        normals)
                self.draw()
            else:
                #add new extractor
                args = temp.split()
                location = (int(args[0]), int(args[1]), int(args[2]))

                primary_id = []
                secondary_ids = []
                split_str = re.split(":", args[3])
                primary_id = [int(split_str[0])]
                if split_str[1] != "":
                    secondary_ids = [
                        int(label) for label in re.split(',', split_str[1])
                    ]
                ids = [primary_id + secondary_ids]

                extr = extractor.Extractor(self.in_q, self.directory, ids,
                                           location, max_x, max_y)
                extracting_worker = threading.Thread(target=extr.run,
                                                     name="extr")
                extracting_worker.daemon = True
                extracting_worker.start()
            self.st = time.time()
        if time.time() - self.st > 0.5:
            self.icon_color = np.array((0.0, 1.0, 0.0))
        self.draw()
Code example #22
    def makeReq(self):
        self.ensureHtmlDirExists()
        with requests.Session() as c:
            try:
                res = c.post(self.targetUrl,
                             data=self.loginPayload,
                             headers={
                                 "Referer":
                                 "https://slcm.manipal.edu/loginForm.aspx"
                             })
                if res.url.endswith('loginForm.aspx'):  #login failure
                    self.loginError = True
                else:  #login success
                    self.loginError = False
                    if os.path.isdir(self.htmlSavePath +
                                     self.username) == False:
                        os.mkdir(self.htmlSavePath + self.username)
                    if self.efficient == False:
                        homePageCode = c.get(
                            'https://slcm.manipal.edu/studenthomepage.aspx')
                        self.saveHtmlFile(homePageCode, '_homepage')
                    self.gatherHtmlFiles(c)
                # else:
                #     self.loginError = True
            except:
                self.collectionError = True

            if self.loginError == False and self.collectionError == False:
                print('[C] Launching extractor...')
                newData = extractor.Extractor(self.username, self.password)
                newData.scrapeEverything()
                self.attendanceData = newData.attendanceData
                self.marksData = newData.marksData
                if newData.extractionError == True:
                    self.errorDuringExtraction = True
            else:
                if self.loginError:
                    print('[C] [ERROR] Login Error')
                if self.collectionError:
                    print('[C] [ERROR] Collection Error')
Code example #23
File: extract.py Project: GoncaloKLopes/rnndissect
        acts_file = "../../model/simple_rnn_2layers.pt"
    elif config_str == "rnn3":
        config = RNN_CONFIG3
        acts_file = "../../model/simple_rnn_2layers_bidir.pt"
    elif config_str == "lstm1":
        config = LSTM_CONFIG1
        acts_file = "../../model/lstm.pt"
    elif config_str == "lstm2":
        config = LSTM_CONFIG2
        acts_file = "../../model/lstm_2layers.pt"
    elif config_str == "lstm3":
        config = LSTM_CONFIG3
        acts_file = "../../model/lstm_2layers_bidir.pt"
    else:
        raise ValueError("NOT A VALID CONFIG.")

    config.output_dim = 2

    model = BinarySARNN(config)
    model.load_state_dict(torch.load(acts_file))
    model.to(DEVICE)

    with open(os.path.join("../../reviews", input_file)) as f:
        inpt = f.read()

    json_fname = config_str + "_" + input_file[:-4] + ".json"
    output = rnndissect.utils.model_utils.classify(model, inpt)

    ex = extr.Extractor(config, model)
    ex.activations_to_json(inpt, json_fname)
Code example #24
"""
     This file is part of libextractor.
     (C) 2002, 2003, 2004, 2005 Vidyut Samanta and Christian Grothoff

     libextractor is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published
     by the Free Software Foundation; either version 2, or (at your
     option) any later version.

     libextractor is distributed in the hope that it will be useful, but
     WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     General Public License for more details.

     You should have received a copy of the GNU General Public License
     along with libextractor; see the file COPYING.  If not, write to the
     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
     Boston, MA 02111-1307, USA.

A little demo of how to use the libextractor Python binding.

"""
import extractor
import sys

xtract = extractor.Extractor()
for arg in sys.argv[1:]:
    print "Keywords from %s:" % arg
    keys = xtract.extract(arg)
    for keyword_type, keyword in keys:
        print "%s - %s" % (keyword_type.encode('iso-8859-1'), keyword.encode('iso-8859-1'))
Code example #25
File: extractfsm.py Project: ianhuang0630/FSMRNN
    def extract(self):
        ## extract states from the model trained through the test set
        self.rnn.reset_states()
        print("predicting labels...")

        ## predictions to label states positive and negative
        labels = self.rnn.predict(self.data, batch_size=BATCH_SIZE)

        ## extract hidden states for all sequences across all chars.
        ex = extractor.Extractor(self.rnn, [0])
        states = ex.get_states(self.data, batch_size=BATCH_SIZE, \
                unshuffle=True)
        states_perchar = states.reshape(self.data.shape[0]*self.data.shape[1],-1)

        ## clustering using k-means
        print("Clustering...")
        kmeans = KMeans(n_clusters=self.k_num, random_state=0)
        kmeans.fit(states_perchar)
        print("Complete.")
        print("Inertia: {}".format(kmeans.inertia_))

        #######################################################################
        # search for first occurrence of each state in kmeans.labels_
        # TODO: randomize, and make it pass through all examples
        first_occur = self.find_first(kmeans.labels_)
        # indices correspond to the index in 250000,16, change first_occur to seq_num and char_num
        first_seq_char = {element: [first_occur[element]//50, first_occur[element]%50] for element in first_occur}
        #######################################################################

        ## create state objects and visited/unvisited markers
        states = [State("s{}".format(i)) for i in range(self.k_num)]
        visited = [False]*self.k_num

        ## initializing variables for BFS
        current_state_num = 0
        visited[0] = True
        q = deque()
        q.append(states[0])

        # create DFA object
        dfa = DFA()
        dfa.set_omega(self.alphabet)
        dfa.set_Q(states)
        dfa.set_q_0(states[0])

        prev_input = None
        counter = 0; threshold = 100

        while q:

            print("Iteration # {}".format(counter))
            print("------------------------------")

            counter += 1
            if counter > threshold:
                raise Warning("while loop doesn't seem to exit")

            current_state = q.popleft()
            current_state_num = current_state.get_state_num()

            # label of the timestep is
            label_state = labels[first_occur[current_state_num]]
            # TODO: label_state is a double -- may have to round?

            if label_state == 1:
                dfa.add_F(current_state)

            # locate the first occurrence of current_state

            seq_num = first_seq_char[current_state_num][0]
            char_num = first_seq_char[current_state_num][1]
            sofar_str = self.data[seq_num][:char_num+1]

            # sofar_str should lead to the state being observed
            
            # this is naive -- because of padding.
            prev_input = sofar_str[-1] # this is the last letter in the previous sequence

            print("prev character = {}".format(self.arr2char(prev_input.tolist())))

            for i in self.next_alphabet(prev_input):

                print("Trying character {}".format(self.arr2char(i.tolist())))

                # add i at the back of the test sequence
                new = np.vstack((sofar_str, i))
                len_string = len(i)

                # new will not be 50 in length anymore. This can be remedied by 
                # deleting the first len(i) elements, given that they are X's
                
                if new.shape[0] > 50:
                    if any([ all(element == np.array([1,0,0,0,0])) for element in new[:len_string]]):
                        new = new[len_string:]
                    else:
                        raise ValueError("The sequence doesn't start with enough X's")
                elif new.shape[0] < 50:
                    insert = np.tile(np.array([1,0,0,0,0]), (50 - new.shape[0],1))
                    new = np.vstack((insert, new))

                # ok... so this part is super inefficient, but i think it works.
                train_copy = self.data[seq_num:seq_num + BATCH_SIZE].copy()
                train_copy[0] = new

                # plug sequence into RNN model
                new_states = ex.get_states(train_copy, 
                        batch_size=BATCH_SIZE, unshuffle=True)
                
                # feature extraction at position first_occurrence + 1
                
                next_s = new_states[0,-16:]

                # kmeans classification

                next_state_num = kmeans.predict(next_s)[0] # next_state is an int
                next_state = states[next_state_num]

                # if the next state has already been visited, add_delt, but
                # don't enqueue.

                trigger_char = self.arr2char(i.tolist())

                current_state.add_delt(trigger_char, next_state) 
                # linking current_state to a State object
                dfa.add_delta(current_state, trigger_char, next_state)

                if not visited[next_state_num]:
                    # enqueue
                    q.append(next_state)

            # process of adding to DFA object

        return dfa, states
Code example #26
    def train(self, items, limit=None):
        self.clusters = {}
        self.noise = []

        items = list(items)

        # Extract the features we want to use for clustering from the items
        self.extractor = extractor.Extractor()
        self.features = self.extractor.fit_transform(items, limit=limit)

        if self.verbose:
            sys.stderr.write("{0}: Items to cluster\n".format(
                len(self.features)))

        jobs = os.cpu_count() or -1
        start = time.perf_counter()

        # Initialize the NCD code with our log feature. Currently only
        # one feature is used: the normalized log
        X = ncd.prepare(
            map(lambda features: features[extractor.FEATURE_LOG],
                self.features))

        # Calculate all the pairwise distances between the items in question
        # The scikit DBSCAN implementation does this anyway, poorly. So why not
        # do it ahead of time and parallelize it ... which we do here. Then we
        #
        # TODO: This takes forever and is an O(n^2) operation
        # There is significant room for improvement both here, and in the following
        # DBSCAN usage and implementation. Techniques such as feature/item selection
        # BIRCH, ball trees, or many other things could make this better/faster
        matrix = sklearn.metrics.pairwise.pairwise_distances(X,
                                                             metric=ncd.metric,
                                                             state=ncd.state,
                                                             n_jobs=jobs)

        if self.verbose:
            sys.stderr.write(
                "{0}: Computed distances in {1} seconds on {2} cores\n".format(
                    int((len(self.features) * len(self.features)) / 2),
                    int(time.perf_counter() - start), jobs))

        # Actually perform the clustering. This is fast compared to above
        min_samples = min(self.min_samples, len(self.features) / 10)
        dbs = sklearn.cluster.DBSCAN(metric='precomputed',
                                     eps=self.eps,
                                     min_samples=min_samples)
        dbs.fit(matrix)
        labels = dbs.labels_

        # Create clusters of all the items
        clusters = {}
        noise = []
        for i, label in enumerate(labels):
            if label == -1:
                noise.append(i)
            else:
                if label not in clusters:
                    clusters[label] = []
                clusters[label].append(i)
        self.clusters = {}
        for label, indexes in clusters.items():
            self.clusters[label] = Cluster(label, indexes)
        self.noise = Cluster(None, noise)

        # Print out a rough description of that
        if self.verbose:
            sys.stderr.write("{0}: Clusters ({1} items, {2} noise)\n".format(
                len(self.clusters.keys()),
                len(self.features) - len(noise), len(noise)))

        # Setup our neighbors classifier for predict()
        self.neighbors = sklearn.neighbors.KNeighborsClassifier(
            metric='precomputed', weights='distance')
        self.neighbors.fit(matrix, labels)

        # Collapse the high dimensionality of the matrix data
        #
        # HACK: scikit-learn has a parallelization bug where pairwise_distances
        # returns a non-symmetric array for certain floats when using loky (default)
        # parallelization. The MDS call requires a symmetric array up to E-10
        matrix = sklearn.utils.validation.check_symmetric(matrix)
        self.squashed = sklearn.manifold.MDS(
            n_components=2, dissimilarity='precomputed').fit_transform(matrix)
Code example #27
    max_x = int(sys.argv[4])
    max_y = int(sys.argv[5])

    ids = []
    for label_set in (sys.argv[6:len(sys.argv)]):
        primary_id = []
        secondary_ids = []
        split_str = re.split(":", label_set)
        primary_id = [int(split_str[0])]
        if split_str[1] != "":
            secondary_ids = [
                int(label) for label in re.split(',', split_str[1])
            ]
        ids += [primary_id + secondary_ids]

    extr = extractor.Extractor(display_queue, directory, ids, location, max_x,
                               max_y)
    viewer = Viewer(location, display_queue, directory, max_x, max_y)
    handler = handler.Input_Handler(display_queue)

    viewer.set_dimensions(extr.rows, extr.columns, extr.layers, extr.w)

    extracting_worker = threading.Thread(target=extr.run, name="extr")
    input_worker = threading.Thread(target=handler.run, name="input_worker")

    input_worker.daemon = True
    extracting_worker.daemon = True
    extracting_worker.start()
    input_worker.start()

    viewer.main()
Code example #28
import extractor
import visualizer

sims = extractor.Extractor("file path here")

sims.analyze(n=10, topics=25)
Code example #29
File: gmMimeLib.py Project: rockdriven/gnumed
def guess_mimetype(filename=None):
    """Guess mime type of arbitrary file.

    filenames are supposed to be in Unicode
    """
    worst_case = "application/octet-stream"
    _log.debug('guessing mime type of [%s]', filename)
    # 1) use Python libextractor
    try:
        import extractor
        xtract = extractor.Extractor()
        props = xtract.extract(filename=filename)
        for prop, val in props:
            if (prop == 'mimetype') and (val != worst_case):
                return val
    except ImportError:
        _log.debug(
            'module <extractor> (python wrapper for libextractor) not installed'
        )
    except OSError as exc:
        # winerror 126, errno 22
        if exc.errno != 22:
            raise
        _log.exception(
            'module <extractor> (python wrapper for libextractor) not installed'
        )

    ret_code = -1
    # 2) use "file" system command
    #    -i get mime type
    #    -b don't display a header
    mime_guesser_cmd = 'file -i -b "%s"' % filename
    # this only works on POSIX with 'file' installed (which is standard, however)
    # it might work on Cygwin installations
    aPipe = os.popen(mime_guesser_cmd, 'r')
    if aPipe is None:
        _log.debug("cannot open pipe to [%s]" % mime_guesser_cmd)
    else:
        pipe_output = aPipe.readline().replace('\n', '').strip()
        ret_code = aPipe.close()
        if ret_code is None:
            _log.debug('[%s]: <%s>' % (mime_guesser_cmd, pipe_output))
            if pipe_output not in ['', worst_case]:
                return pipe_output.split(';')[0].strip()
        else:
            _log.error('[%s] on %s (%s): failed with exit(%s)' %
                       (mime_guesser_cmd, os.name, sys.platform, ret_code))

    # 3) use "extract" shell level libextractor wrapper
    mime_guesser_cmd = 'extract -p mimetype "%s"' % filename
    aPipe = os.popen(mime_guesser_cmd, 'r')
    if aPipe is None:
        _log.debug("cannot open pipe to [%s]" % mime_guesser_cmd)
    else:
        pipe_output = aPipe.readline()[11:].replace('\n', '').strip()
        ret_code = aPipe.close()
        if ret_code is None:
            _log.debug('[%s]: <%s>' % (mime_guesser_cmd, pipe_output))
            if pipe_output not in ['', worst_case]:
                return pipe_output
        else:
            _log.error('[%s] on %s (%s): failed with exit(%s)' %
                       (mime_guesser_cmd, os.name, sys.platform, ret_code))

    # If we end up here we either have an insufficient systemwide
    # magic number file or we suffer from a deficient operating system
    # altogether. It can't get much worse if we try ourselves.

    _log.info("OS level mime detection failed, falling back to built-in magic")

    import gmMimeMagic
    mime_type = gmTools.coalesce(gmMimeMagic.filedesc(filename), worst_case)
    del gmMimeMagic
    _log.debug('"%s" -> <%s>' % (filename, mime_type))
    return mime_type
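A short usage sketch for the function above (the filename is illustrative); when the libextractor binding is unavailable, the `file` and `extract` command-line fallbacks are tried in turn before gmMimeMagic:

    mime = guess_mimetype(filename='invoice.pdf')
    print('detected mime type: %s' % mime)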
Code example #30
File: extractfsm2.py Project: ianhuang0630/FSMRNN
    def extract(self):
        ## extract states from the model trained through the test set
        self.rnn.reset_states()
        print("predicting labels...")

        ## predictions to label states positive and negative
        labels = self.rnn.predict(self.data, batch_size=BATCH_SIZE)

        ## extract hidden states for all sequences across all chars.
        ex = extractor.Extractor(self.rnn, [0])
        states = ex.get_states(self.data, batch_size=BATCH_SIZE, \
          unshuffle=True)
        states_perchar = states.reshape(
            self.data.shape[0] * self.data.shape[1], -1)

        ## clustering using k-means
        print("Clustering...")
        kmeans = KMeans(n_clusters=self.k_num, random_state=0)
        kmeans.fit(states_perchar)
        print("Complete.")
        print("Inertia: {}".format(kmeans.inertia_))

        ## finding the sequence & char number in testing data corresponding
        ## to every state
        occur = self.find_occurance(kmeans.labels_)

        ## cut down list naively  -- may replace with randomized sampling

        for state in occur:
            threshold = int(FRACTION * len(occur[state]))
            replacement = []
            for i in range(threshold):
                replacement.append(occur[state][i])
            occur[state] = replacement

        ## Instantiate dfa object
        dfa = DFA()
        dfa.set_omega(self.alphabet)
        dfa.set_Q(self.k_labels)
        dfa.initiate_delta()

        # TODO: set q_0? Also not sure about F

        ######################################################################
        ## create state objects and visited/unvisited markers
        # states = [State("s{}".format(i)) for i in range(self.k_num)]
        # visited = [False]*self.k_num

        # ## initializing variables for BFS
        # current_state_num = 0
        # visited[0] = True
        # q = deque()
        # q.append(states[0])

        # # create DFA object
        # dfa = DFA()
        # dfa.set_omega(self.alphabet)
        # dfa.set_Q(states)
        # dfa.set_q_0(states[0])

        ######################################################################

        prev_input = None

        total_processes = sum([len(element) for element in occur.values()])
        print("{} total processes to be run".format(total_processes))

        import ipdb
        ipdb.set_trace()

        block = total_processes / 50.0
        counter = 0

        for cluster in occur.keys():

            # occur[cluster] holds list of tuples
            instances = occur[cluster]

            for (seq_num, char_num) in instances:
                ## Display loading bar:
                counter += 1
                print("#" * int(counter / block) + " " *
                      (50 - int(counter / block)) +
                      "| {} / {}".format(counter, total_processes))

                ## Find the corresponding section of the test data
                chopped = self.data[seq_num][:char_num + 1]
                prev_input = chopped[-1]

                print("prev character: {}".format(
                    self.arr2char(prev_input.tolist())))

                for i in self.next_alphabet(prev_input):
                    print("trying character: {}".format(
                        self.arr2char(i.tolist())))

                    new_seq = np.vstack((prev_input, i))

                    if new_seq.shape[0] > 50:
                        over = new_seq.shape[0] - 50

                        ## check if the first "over" elements are X's; if not,
                        ## this is an unusable example
                        if all([
                                np.array_equal(new_seq[i],
                                               np.array([1, 0, 0, 0, 0]))
                                for i in range(over)
                        ]):
                            new_seq = new_seq[over:]

                        else:
                            print("".join(
                                [self.arr2char(row) for row in new_seq]) +
                                  " does not contain enough X's")
                            pass  # skip out of for loop

                    elif new_seq.shape[0] < 50:
                        over = 50 - new_seq.shape[0]
                        insert = np.tile(np.array([1, 0, 0, 0, 0]), (over, 1))
                        new_seq = np.vstack((insert, new_seq))

                    ## putting sequence into block of size BATCH_SIZE
                    train_copy = self.data[seq_num:seq_num + BATCH_SIZE].copy()
                    train_copy[0] = new_seq

                    ## Find hidden state
                    new_states = ex.get_states(train_copy,
                                               batch_size=BATCH_SIZE,
                                               unshuffle=True)
                    next_s = new_states[0, -16:]

                    ## using SVM to predict the quantized state for hidden state
                    next_state_num = kmeans.predict(next_s[np.newaxis, :])[0]

                    ## increment the (state1, transition, state2) in dfa in delta function
                    transition = self.arr2char(i.tolist())
                    dfa.add_delta(cluster, transition, next_state_num)

        # convert counts in dfa.delta to probabilities.
        dfa.delta_count2prob()

        return dfa