Example #1
    def test_median(self):
        """ Проверка медианы"""
        data = [8, 1, 5, 3, 4, 2, 6, 7, 1, 9]
        result = median(data)
        self.assertEqual(result, 4.5)

        data.pop(0)
        result = median(data)
        self.assertEqual(result, 5.0)
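These assertions only pass if the median helper under test reorders its argument (for example by sorting it in place): after the first call the list has become [1, 1, 2, ..., 9], so data.pop(0) removes a 1 and the median of the nine remaining values is 5.0. A minimal sketch of such a helper, assuming that behaviour (the implementation under test is not shown here):

def median(values):
    values.sort()  # in-place sort; mutates the caller's list, which the second assertion relies on
    n = len(values)
    mid = n // 2
    if n % 2:
        return float(values[mid])  # odd length: middle element
    return (values[mid - 1] + values[mid]) / 2.0  # even length: average of the two middle values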
Example #2
def request_device_readings_quartiles(device_uuid):
    """
    This endpoint allows clients to GET the 1st and 3rd quartile
    sensor reading value for a device.

    Mandatory Query Parameters:
    * type -> The type of sensor value a client is looking for
    * start -> The epoch start time for a sensor being created
    * end -> The epoch end time for a sensor being created
    """

    if request.data:
        post_data = json.loads(request.data)
        type = post_data.get('type', None)
        if not type or type not in ('temperature', 'humidity'):
            return 'error on the required type data', 400
        start = post_data.get('start', None)
        if not start:
            return 'error on the required start data', 400
        end = post_data.get('end', None)
        if not end:
            return 'error on the required end data', 400
    else:
        return 'missing data in the request parameters', 400

    # Set the db that we want and open the connection
    if app.config['TESTING']:
        conn = sqlite3.connect('test_database.db')
    else:
        conn = sqlite3.connect('database.db')
    conn.row_factory = sqlite3.Row
    cur = conn.cursor()

    sql = 'SELECT r.value from readings r WHERE r.type = ? AND r.device_uuid = ? AND r.date_created >= ? AND r.date_created <= ?'
    params = [type, device_uuid, start, end]

    sql += ' ORDER BY r.value'

    # Execute the query
    cur.execute(sql, params)
    rows = [row[0] for row in cur.fetchall()]

    mid = len(rows) // 2

    if (len(rows) % 2 == 0):
        # even
        lowerQ = median(rows[:mid])
        upperQ = median(rows[mid:])
    else:
        # odd
        lowerQ = median(rows[:mid])  # same as even
        upperQ = median(rows[mid + 1:])

    return str(lowerQ) + "," + str(upperQ), 200
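The quartile computation above is the median-of-halves rule. Pulled out of the endpoint, it amounts to the following sketch (median is assumed to behave like statistics.median; the names are illustrative):

from statistics import median  # stand-in for the project's own median()

def quartiles(sorted_values):
    # Q1/Q3 of an already-sorted list; the middle element is excluded when the length is odd
    mid = len(sorted_values) // 2
    lower_half = sorted_values[:mid]
    upper_half = sorted_values[mid:] if len(sorted_values) % 2 == 0 else sorted_values[mid + 1:]
    return median(lower_half), median(upper_half)

# quartiles([1, 2, 3, 4, 5, 6, 7, 8]) -> (2.5, 6.5)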
Example #3
    def CrossValidation(self, cv_method=0, **args):
        """Select ncomp by the requested CV method"""
        validation = self.model["validation"].AsDataFrame()

        # method 0: select the fewest components with PRESS within 1 stdev of the least PRESS (by the bootstrap)
        if cv_method == 0:  # Use the bootstrap to find the standard deviation of the MSEP
            # Get the leave-one-out CV error from R:
            columns = min(self.num_predictors, self.ncomp_max)
            cv = array.array("d", validation["pred"].AsVector())
            rows = len(cv) // columns
            cc = []
            for k in range(int(columns)):
                b = k * rows
                e = b + rows
                cc.append(array.array("d", cv[b:e]))
            cv = cc

            # PRESS = map(lambda x: sum((cv[:,x]-self.array_actual)**2), range(cv.shape[1]))
            PRESS = [sum([(cv[i][j] - self.actual[j]) ** 2 for j in range(rows)]) for i in range(int(columns))]
            # ncomp = np.argmin(PRESS)
            ncomp = [i for i in range(len(PRESS)) if PRESS[i] == min(PRESS)][0]

            # cv_squared_error = (cv[:,ncomp]-self.array_actual)**2
            cv_squared_error = [(cv[ncomp][j] - self.actual[j]) ** 2 for j in range(int(rows))]
            sample_space = range(rows)

            PRESS_stdev = list()

            # Cache random number generator and int's constructor for a speed boost
            _random, _int = random.random, int

            for i in range(100):
                PRESS_bootstrap = list()

                for j in range(100):
                    PRESS_bootstrap.append(sum([cv_squared_error[_int(_random() * rows)] for i in sample_space]))

                PRESS_stdev.append(utils.std(PRESS_bootstrap))

            med_stdev = utils.median(PRESS_stdev)

            # Maximum allowable PRESS is the minimum plus one standard deviation
            good_ncomp = [i for i in range(len(PRESS)) if PRESS[i] < min(PRESS) + med_stdev]
            self.ncomp = int(min(good_ncomp) + 1)

        # method 1: select the fewest components w/ PRESS less than the minimum plus 4% of the range
        if cv_method == 1:
            # PRESS stands for predicted error sum of squares
            PRESS0 = validation["PRESS0"][0]
            PRESS = list(validation["PRESS"])

            # the range is the difference between the greatest and least PRESS values
            PRESS_range = abs(PRESS0 - min(PRESS))

            # Maximum allowable PRESS is the minimum plus a fraction of the range.
            max_CV_error = min(PRESS) + PRESS_range / 25
            good_ncomp = [i for i in range(len(PRESS)) if PRESS[i] < max_CV_error]

            # choose the most parsimonious model that satisfies that criterion
            self.ncomp = int(min(good_ncomp) + 1)
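Both CV branches reduce to the same selection pattern: build a per-component PRESS curve and keep the most parsimonious component count whose PRESS is within a tolerance of the minimum. A compact sketch of that rule (function and argument names are hypothetical):

def select_ncomp(press, tolerance):
    # fewest components (1-based) whose PRESS is within `tolerance` of the best PRESS
    best = min(press)
    good = [i for i, p in enumerate(press) if p < best + tolerance]
    return min(good) + 1

# method 0 above uses tolerance = median bootstrap stdev of the CV squared errors;
# method 1 uses tolerance = abs(PRESS0 - min(PRESS)) / 25, i.e. 4% of the range.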
Example #4
def results_metrics():
    """
    Aggregator for all student results data
    :return: dictionary for all student data
    """
    quizzes = Quiz.objects.all()
    metrics = {}
    for quiz in quizzes:
        results = Results.objects.filter(quiz=quiz)
        quiz_json = json.loads(quiz.quizjson)
        scores = []
        single_metrics = {}
        for result in results:
            print(result)
            scores.append(result.score)
        print(scores)
        single_metrics['scores'] = scores
        single_metrics['name'] = quiz.name
        single_metrics['num_of_questions'] = len(quiz_json['questions'])
        mean = utils.average(scores)
        single_metrics['class_av'] = mean
        single_metrics['std_dev'] = utils.std_deviation(scores, mean)
        single_metrics['subject'] = quiz.subject
        single_metrics['high'] = max(scores)
        single_metrics['low'] = min(scores)
        single_metrics['class_median'] = utils.median(scores)
        metrics[quiz.name] = single_metrics
    return metrics
Example #5
 def printlevelstat(self, level, n=0):
     med = median((i.nleaves for i in level.itervalues()))
     self.logger.debug(output.green(' ' * n + "MED %f nleaves=%d len(k)=%d depth=%d"),
                                    med,
                                    level.nleaves,
                                    len(level.keys()),
                                    level.depth)
     for k, v in level.iteritems():
         nleaves = v.nleaves
         depth = v.depth
         if v and v.clusterable:
             self.logger.debug(
                     output.yellow(
                         ' ' * n + "K %s nleaves=%d r=%.2f depth=%d"),
                     k,
                     nleaves,
                     float(nleaves)/med,
                     depth)
         else:
             self.logger.debug(
                     output.green(
                         ' ' * n + "K %s nleaves=%d r=%.2f depth=%d"),
                     k,
                     nleaves,
                     float(nleaves)/med,
                     depth)
         if v:
             self.printlevelstat(v, n+1)
Example #6
    def make_plan(self, state):
        curr_state = copy.deepcopy(state)
        if self.active_goal is None:
            self.active_goal = self.uncompleted_goals[0]

        problem = self.services.problem_generator.generate_problem(
            self.active_goal, curr_state)
        self.plan = self.services.planner(self.services.pddl.domain_path,
                                          problem)

        for i in range(len(self.plan)):
            action = self.plan[i]
            curr_state_hash = encode_state(curr_state)
            weight = float(i + 1) / len(self.plan)
            if self.weights[curr_state_hash][action.lower()] < weight:
                self.weights[curr_state_hash][action.lower()] = weight
            curr_state = my_apply_action_to_state(curr_state, action,
                                                  self.services.parser)

        local_weights = list()
        for state_hash in self.weights:
            vals = list(self.weights[state_hash].values())
            local_weights.extend(vals)
        self.state_recurrence_punish = median(local_weights)
        self.lookahead = min([4, int(len(self.plan) / 2)])
Example #7
    def run_test(self, test_cmd):
        ###############################################################################
        if self._cd:
            test_path, test_exe = os.path.split(test_cmd)
            test_path = None if not test_path else test_path
        else:
            test_exe = test_cmd
            test_path = None

        self.machine_specific_init(self._scaling_exp.threads)
        self.test_specific_init(test_exe, self._scaling_exp.threads)

        cmd = self.formulate_cmd(test_exe)
        results = []
        with open("{}.perf.log".format(
                os.path.split(test_exe)[1].split(" ")[0]),
                  "w",
                  encoding="utf-8") as fd:
            fd.write(cmd + "\n\n")
            fd.write("ENV: \n{}\n\n".format(run_cmd_no_fail("env")))
            for _ in range(self._num_runs):
                output = run_cmd_no_fail(cmd,
                                         from_dir=test_path,
                                         verbose=(not self._plot_friendly
                                                  or self._verbose))
                fd.write(output + "\n\n")
                results.append(self.get_time(output))

            threads = self.get_threads(output)

        return median(results), threads
Example #8
 def build(points):
     if len(points) == 1:
         return Node(None, None, points[0])
     else:
         points = sorted(points, key=lambda x: x[1])
         m_idx = median(points)
         v_left = Tree.build(points[:m_idx+1])
         v_right = Tree.build(points[m_idx+1:])
         v_val = points[m_idx]
         return Node(v_left, v_right, v_val)
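Note that median() is used here as an index into the sorted list rather than as a value, so the helper this build() snippet (and Examples #9, #13 and #14 below) relies on presumably returns the middle position. A one-line sketch of such a helper, as an assumption:

def median(points):
    # index of the median element of an already-sorted sequence (lower middle for even lengths)
    return (len(points) - 1) // 2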
Example #9
 def build(points):
     if len(points) == 1:
         return Node(None, None, points[0])
     else:
         points = sorted(points, key=lambda x: x[1])
         m_idx = median(points)
         v_left = Tree.build(points[:m_idx + 1])
         v_right = Tree.build(points[m_idx + 1:])
         v_val = points[m_idx]
         return Node(v_left, v_right, v_val)
Example #10
def clock_gets(number=100):
    times = []
    for i in xrange(number):
        start = clock()
        x=get_message()
        time_taken = clock() - start
        times.append(time_taken)
    median_time = median(times)
    mean_time = mean(times)
    stddev_time = stddev(times)
    return (median_time,mean_time,stddev_time)
Example #11
 def robust_normal_generator(self, x, relative): 
   if relative:
     raise KeyError("robust_gaussian is not meaningful when 'relative' is True.")
   else:
     return tf.random.truncated_normal( # TODO Should be truncated by min and max, not 2x std
       array_ops.shape(x),
       mean=median(x),
       stddev=iqr(x),
       dtype=tf.dtypes.float32,
       seed=None,
       name="robust_normal_noise_generator"
     )
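iqr(x) above is presumably the interquartile range, used as a robust spread estimate for the truncated normal. A NumPy sketch of that statistic for illustration (not the project's own implementation):

import numpy as np

def iqr(x):
    # interquartile range: spread between the 75th and 25th percentiles
    q1, q3 = np.percentile(x, [25, 75])
    return q3 - q1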
Example #12
def calc_summary_of_window(window: list, win_count: int) -> Text:
    """ Расчет всех статистических параметров
    :param win_count: счетчик окон
    :param window: список значений (окно)
    :return:
    """
    v_min = min(window)
    v_max = max(window)
    v_avg = sum(window) / WINDOW_SIZE
    v_median = median(window)

    return ('window: ' + str(win_count), 'Max: ' + str(v_max) + '; Min: ' +
            str(v_min) + '; Avg: ' + str(v_avg) + '; Mdn: ' + str(v_median))
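Assuming the module-level WINDOW_SIZE equals len(window), a call looks like this (values are illustrative):

WINDOW_SIZE = 5  # assumed module-level constant; must equal len(window) for v_avg to be the true mean

calc_summary_of_window([3, 1, 4, 1, 5], win_count=1)
# -> ('window: 1', 'Max: 5; Min: 1; Avg: 2.8; Mdn: 3')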
Example #13
    def build(pts):
        if len(pts) == 1:
            return Node2D(None, None, pts[0], Tree(pts))
        else:
            pts = sorted(pts, key=lambda x: x[0])

            m_idx = median(pts)
            xSmallerOrEqual = pts[:m_idx+1]
            xLarger = pts[m_idx+1:]
            v_left = Tree2D.build(xSmallerOrEqual)
            v_right = Tree2D.build(xLarger)
            associatedStr = Tree(pts)
            v_val = pts[m_idx]
            return Node2D(v_left, v_right, v_val, associatedStr)
Example #14
    def build(pts):
        if len(pts) == 1:
            return Node2D(None, None, pts[0], Tree(pts))
        else:
            pts = sorted(pts, key=lambda x: x[0])

            m_idx = median(pts)
            xSmallerOrEqual = pts[:m_idx + 1]
            xLarger = pts[m_idx + 1:]
            v_left = Tree2D.build(xSmallerOrEqual)
            v_right = Tree2D.build(xLarger)
            associatedStr = Tree(pts)
            v_val = pts[m_idx]
            return Node2D(v_left, v_right, v_val, associatedStr)
Example #15
def request_device_readings_median(device_uuid):
    """
    This endpoint allows clients to GET the median sensor reading for a device.

    Mandatory Query Parameters:
    * type -> The type of sensor value a client is looking for

    Optional Query Parameters
    * start -> The epoch start time for a sensor being created
    * end -> The epoch end time for a sensor being created
    """

    if request.data:
        post_data = json.loads(request.data)
        type = post_data.get('type', None)
        if not type or type not in ('temperature', 'humidity'):
            return 'error on the required type data', 400
        start = post_data.get('start', None)
        end = post_data.get('end', None)
    else:
        return 'missing data in the request parameters', 400

    # Set the db that we want and open the connection
    if app.config['TESTING']:
        conn = sqlite3.connect('test_database.db')
    else:
        conn = sqlite3.connect('database.db')
    conn.row_factory = sqlite3.Row
    cur = conn.cursor()

    sql = 'SELECT r.value from readings r WHERE r.type = ? AND r.device_uuid = ?'
    params = [type, device_uuid]
    if start:
        sql += ' AND r.date_created >= ?'
        params += [start]
    if end:
        sql += ' AND r.date_created <= ?'
        params += [end]

    sql += ' ORDER BY r.value'

    # Execute the query
    cur.execute(sql, params)
    rows = [row[0] for row in cur.fetchall()]

    if len(rows) == 0:
        return 'No results found', 200

    return str(median(rows)), 200
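A sketch of how a client might exercise this endpoint with Flask's test client; the route registration is not shown above, so the URL used here is hypothetical:

import json

resp = app.test_client().get(
    '/devices/abc123/readings/median/',  # adjust to however the view is actually registered
    data=json.dumps({'type': 'temperature'}))
print(resp.status_code, resp.data)  # e.g. 200 b'22.5', or 'No results found' when no rows match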
Example #16
    def analyze_logcat(self):
        """
        __start_report12.853116__end_report

        We will parse the syntax here and build up a {name:[value,],} hash.
        Next we will compute the median value for each name.
        Finally we will report the geometric mean of all of the median values.
        """
        self.loggerdeco.debug('analyzing logcat')

        re_data = re.compile('.*__start_report([0-9\.]+)__end_report.*')

        attempt = 1
        max_time = 90  # maximum time to wait for completeness score
        wait_time = 3  # time to wait between attempts
        max_attempts = max_time / wait_time

        results = {"tcheck3": []}
        pageload_metric = {'summary': 0}
        while attempt <= max_attempts and pageload_metric['summary'] == 0:
            buf = self.logcat.get()
            for line in buf:
                match = re_data.match(line)
                if match:
                    numbers = match.group(1)
                    if numbers:
                        results["tcheck3"].append(float(numbers))

            if self.fennec_crashed:
                # If fennec crashed, don't bother looking for pageload metric
                break
            if pageload_metric['summary'] == 0:
                sleep(wait_time)
                attempt += 1

            if not results["tcheck3"]:
                continue

            # calculate score
            data = results["tcheck3"]
            pageload_metric["tcheck3"] = median(data)
            pageload_metric['summary'] = geometric_mean(data)

        if pageload_metric['summary'] == 0:
            self.loggerdeco.info('Unable to find pageload metric')

        self.loggerdeco.info("returning from logcat analyze with: %s" %
                             pageload_metric)
        return pageload_metric
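geometric_mean here is assumed to be the usual n-th root of the product of the values; a small stand-in for illustration (not necessarily the project's own helper):

import math

def geometric_mean(values):
    # n-th root of the product, computed via logs for numerical stability
    return math.exp(sum(math.log(v) for v in values) / len(values))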
Example #17
def get_period(kic):
    frequencies = []
    df_list = []
    filenames = utils.get_filenames(utils.BASE_PATH + str(kic), "csv")
    if len(filenames) <= 1:
        return {"period": 0.0, "fap": 0.0, "theta": 0.0, "periods": []}
 
    for idx, filename in enumerate(filenames):
        if (idx > 2):
            data = utils.pd.read_csv(utils.BASE_PATH + str(kic) + "/" + filename)
            try:
                freq = utils.get_freq_LS(data.TIME.to_numpy(),data.PDCSAP_FLUX.to_numpy(),data.EFPDC.to_numpy())
                frequencies.append(freq)
            except Exception as e:
                print(e)
                print(idx)
                print(kic)

            df_list.append(data)
    
    df = utils.pd.DataFrame()
    for _df in df_list:
        df = df.append(_df)        
          
    t = df.TIME.to_numpy()
    y = df.FPDC.to_numpy()
    dy = df.EFPDC.to_numpy()
    
    period1 = utils.get_period(t, y, dy, frequencies)
    period2 = utils.get_period(t, y, dy)    
    
    periods = [period1, period2]
    nbins = 3
    
    if period2 < 0.09 or period2 > 100:
        period = period1
        theta = None
    else:
        try:  
            period, theta = utils.get_period_pdm(t, y, dy, periods, nbins)
        except:
            period = utils.median(periods) 
            theta = None   
    
    df = None
    data = None
    df_list = []
    return {"period": period, "theta": theta, "periods": periods}
Example #18
    def scanlevels(self, level, n=0):
        for k, v in level.iteritems():
            nleaves = v.nleaves
            if v: # if there are descendants
                # XXX magic number
                # require more than X pages in a cluster

                # require some diversity in the dom path in order to create a link
                med = median((i.nleaves for i in v.itervalues()))
                if nleaves > med and nleaves > 8*(1+1.0/(n+1)) and len(k) > 7.0*math.exp(-n) \
                        and n >= 3:
                    v.clusterable = True
                    level.clusterable = False
                else:
                    v.clusterable = False
                    self.scanlevels(v, n+1)
Example #19
    def scanlevelspath(self, level, path, n=0):
        v = level[path[0]]
        nleaves = v.nleaves if hasattr(v, "nleaves") else len(v)
        if v: # if there are descendants
            # XXX magic number
            # require more than X pages in a cluster

            # require some diversity in the dom path in order to create a link
            med = median((i.nleaves for i in v.itervalues()))
            if nleaves > med and nleaves > 8*(1+1.0/(n+1)) and len(path[0]) > 7.0*math.exp(-n) \
                    and n >= 3:
                v.newclusterable = True
                level.newclusterable = False
            else:
                v.newclusterable = False
            self.scanlevelspath(v, path[1:], n+1)
        if not hasattr(level, "clusterable"):
            level.clusterable = False
Example #20
def calc_median_metric(log, max_cnt_label, logscale_flag, extra_tail=0):
    metrics = []
    for e in log.orig_err_logs:
        le = [(x[0], x[1]) for x in e if x[0] <= max_cnt_label]
        length = len(le)
        if length < len(e):
            p = (max_cnt_label - le[-1][0]) / (e[length][0] - le[-1][0])
            le.append(
                (max_cnt_label, le[-1][1] + (e[length][1] - le[-1][1]) * p))
            length += 1
        le += [((max_cnt_label + 1) * (1 + extra_tail), le[-1][1])]
        res = 0
        if logscale_flag:
            for i in range(1, length + 1):
                res += 0.5 * (le[i][1] + le[i - 1][1]) * math.log(
                    (le[i][0] + 1.0) / (le[i - 1][0] + 1.0))
        else:
            for i in range(1, length + 1):
                res += 0.5 * (le[i][1] + le[i - 1][1]) * (le[i][0] -
                                                          le[i - 1][0])
        metrics.append(res)
    return utils.median(metrics)
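Each per-log metric above is the area under the (count, error) curve, accumulated with the trapezoid rule (on a log(count + 1) axis when logscale_flag is set), and the final score is the median of those areas. A small worked check of the linear branch:

# trapezoid area under a toy (count, error) curve, mirroring the linear branch above
pts = [(0, 1.0), (1, 0.5), (2, 0.25)]
area = sum(0.5 * (pts[i][1] + pts[i - 1][1]) * (pts[i][0] - pts[i - 1][0])
           for i in range(1, len(pts)))
# area == 0.5 * (1.0 + 0.5) * 1 + 0.5 * (0.5 + 0.25) * 1 == 1.125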
Example #21
    def run_test(self, exename):
        ###############################################################################
        self.machine_specific_init(self._scaling_exp.threads)
        self.test_specific_init(exename, self._scaling_exp.threads)
        prefix = "" if "NUMA_PREFIX" not in os.environ else "{} ".format(
            os.environ["NUMA_PREFIX"])
        cmd = "{}./{} {}".format(
            prefix, exename, " ".join([
                str(item)
                for item in self._scaling_exp.values(incl_threads=False)
            ]))
        results = []
        with open("{}.perf.log".format(exename), "w") as fd:
            fd.write(cmd + "\n\n")
            fd.write("ENV: \n{}\n\n".format(run_cmd_no_fail("env")))
            for _ in range(self._num_runs):
                output = run_cmd_no_fail(cmd, verbose=not self._plot_friendly)
                fd.write(output + "\n\n")
                results.append(self.get_time(output))

            threads = self.get_threads(output)

        return median(results), threads
Example #22
    def CrossValidation(self, cv_method=0, **args):
        '''Select ncomp by the requested CV method'''
        validation = self.model['validation'].AsDataFrame()

        #method 0: select the fewest components with PRESS within 1 stdev of the least PRESS (by the bootstrap)
        if cv_method == 0:  #Use the bootstrap to find the standard deviation of the MSEP
            #Get the leave-one-out CV error from R:
            columns = min(self.num_predictors, self.ncomp_max)
            cv = array.array('d', validation['pred'].AsVector())
            rows = len(cv) // columns
            cc = []
            for k in range(int(columns)):
                b = k * rows
                e = b + rows
                cc.append(array.array('d', cv[b:e]))
            cv = cc

            #PRESS = map(lambda x: sum((cv[:,x]-self.array_actual)**2), range(cv.shape[1]))
            PRESS = [
                sum([(cv[i][j] - self.actual[j])**2 for j in range(rows)])
                for i in range(int(columns))
            ]
            #ncomp = np.argmin(PRESS)
            ncomp = [i for i in range(len(PRESS)) if PRESS[i] == min(PRESS)][0]

            #cv_squared_error = (cv[:,ncomp]-self.array_actual)**2
            cv_squared_error = [(cv[ncomp][j] - self.actual[j])**2
                                for j in range(int(rows))]
            sample_space = range(rows)

            PRESS_stdev = list()

            #Cache random number generator and int's constructor for a speed boost
            _random, _int = random.random, int

            for i in range(100):
                PRESS_bootstrap = list()

                for j in range(100):
                    PRESS_bootstrap.append(
                        sum([
                            cv_squared_error[_int(_random() * rows)]
                            for i in sample_space
                        ]))

                PRESS_stdev.append(utils.std(PRESS_bootstrap))

            med_stdev = utils.median(PRESS_stdev)

            #Maximum allowable PRESS is the minimum plus one standard deviation
            good_ncomp = [
                i for i in range(len(PRESS))
                if PRESS[i] < min(PRESS) + med_stdev
            ]
            self.ncomp = int(min(good_ncomp) + 1)

        #method 1: select the fewest components w/ PRESS less than the minimum plus 4% of the range
        if cv_method == 1:
            #PRESS stands for predicted error sum of squares
            PRESS0 = validation['PRESS0'][0]
            PRESS = list(validation['PRESS'])

            #the range is the difference between the greatest and least PRESS values
            PRESS_range = abs(PRESS0 - min(PRESS))

            #Maximum allowable PRESS is the minimum plus a fraction of the range.
            max_CV_error = min(PRESS) + PRESS_range / 25
            good_ncomp = [
                i for i in range(len(PRESS)) if PRESS[i] < max_CV_error
            ]

            #choose the most parsimonious model that satisfies that criterion
            self.ncomp = int(min(good_ncomp) + 1)
Example #23
 def finish(self):
     f = open("pos_variances.txt","w")
     for i in range(len(self.posnames)):
         mean = utils.mean(self.counts[i])
         f.write("\t".join([
             self.posnames[i],
             str(mean),
             str(utils.median(self.counts[i])),
             str(utils.variance(self.counts[i])),
             str(utils.moment(self.counts[i], mean, 3)),
             str(utils.moment(self.counts[i], mean, 4)),
             str(len([x for x in self.counts[i] if x > 0])),
         ]) + "\n")
Example #24
def pdf2heads(opts, args):
    global Verbose_flag
    xmltag = True
    highlight = False
    titleonly = False
    authonly = False
    Verbose_flag = False
    look_for_all_caps_headings = False
    global automatic_rerunning
    global Found_abstract
    global Found_Sammanfattning

    start_to_exclude = False

    for o, a in opts:
        if (o == '--noxml'):
            xmltag = False
        elif (o == '--highlight'):
            highlight = True
        if (o == '--title'):
            titleonly = True
        elif (o == '--author'):
            authonly = True
        elif (o == '--verbose'):
            Verbose_flag = True
            print "Verbose_flag is on"
        elif (o == '--caps'):
            print "looking for ABSTRACT and other headers in all caps"
            look_for_all_caps_headings = True

    if automatic_rerunning:
        print "looking for ABSTRACT and other headers in all caps"
        look_for_all_caps_headings = True

    tree = pdf2etree(args)

    # find title - look on the first page of the document at the first block of text on the page
    page = 1
    block = 1
    title_node = None
    while True:
        try:
            trial_title_node = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(
                page, block))[0]
            if Verbose_flag:
                print "trial_title_node:"
                print trial_title_node

#            title_headers = trial_title_node.xpath(".//TOKEN[@font-size > {0}]".format(23))
# note that the Title is assumed to be 20 points or larger in size
            title_headers = trial_title_node.xpath(
                ".//TOKEN[@font-size > {0}]".format(20))
            if Verbose_flag:
                print "title_headers:"
                print title_headers
            title_head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in title_headers
            ])
            if len(title_head_txt):
                print "<Title>" + title_head_txt + "</Title>"
                title_node = trial_title_node
                next_block = block + 1
                break
        except IndexError:
            page += 1
        else:
            break
        if page > 2:
            # probably not going to find it now
            break

    # find subtitle - note that a subtitle is optional - start on the 2nd page and the second block on the page
    page = 2
    block = 2
    next_block = 2
    subtitle_node = None
    while True:
        try:
            trial_subtitle_node = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(
                page, block))[0]
            if Verbose_flag:
                print "trial_subtitle_node:"
                print trial_subtitle_node

# the Subtitle is assumed to be larger than 19 points
            subtitle_headers = trial_subtitle_node.xpath(
                ".//TOKEN[@font-size > {0}]".format(19))
            if Verbose_flag:
                print "subtitle_headers:"
                print subtitle_headers

            if len(subtitle_headers) == 0:
                next_block = 2
                break
            subtitle_head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in subtitle_headers
            ])
            if len(subtitle_head_txt):
                subtitle_node = trial_subtitle_node
                print "<Subtitle>" + title_head_txt + "</Subtitle>"
                next_block = 3
                break

        except IndexError:
            block += 1
        else:
            break
        if block > 4:
            # probably not going to find it now
            break

    # find author - on inside cover
    page = 2
    block = next_block
    auth_node = None
    while True:
        try:
            trial_auth_node = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(
                page, block))[0]
            if Verbose_flag:
                print "trial_auth_node:"
                print trial_auth_node

# the author's name(s) is(are) assumed to be 15 points or larger in size
            auth_headers = trial_auth_node.xpath(
                ".//TOKEN[@font-size > {0}]".format(15))
            if Verbose_flag:
                print "auth_headers:"
                print auth_headers
            auth_head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in auth_headers
            ])
            if len(auth_head_txt):
                auth_node = trial_auth_node
                break

        except IndexError:
            block += 1
        else:
            break
        if block > 4:
            # probably not going to find it now
            break

    font_sizes = tree.xpath('//TOKEN/@font-size')
    mean_font_size = mean(font_sizes)
    median_font_size = median(font_sizes)

    #    print "Median Font Size (i.e. body text):", median_font_size

    font_colors = tree.xpath('//TOKEN/@font-color')
    font_color_hash = {}
    for fc in font_colors:
        try:
            font_color_hash[fc] += 1
        except KeyError:
            font_color_hash[fc] = 1

    sortlist = [(v, k) for k, v in font_color_hash.iteritems()]
    sortlist.sort(reverse=True)
    main_font_color = sortlist[0][1]
    head_txts = []
    stop = False

    page = 0
    Found_abstract = False
    Found_Sammanfattning = False

    for page_node in tree.xpath('//PAGE'):
        page = page + 1
        block_number = 0
        for block_node in page_node.xpath('.//BLOCK'):
            block_number = block_number + 1
            if xmltag:
                if block_node == title_node:
                    st = "<title>"
                    et = "</title>"
                if block_node == subtitle_node:
                    st = "<subtitle>"
                    et = "</subtitle>"
                elif block_node == auth_node:
                    st = "<author>"
                    et = "</author>"
                else:
                    st = "<heading>"
                    et = "</heading>"

                if highlight:
                    st = "\033[0;32m{0}\033[0m".format(st)
                    et = "\033[0;32m{0}\033[0m".format(et)
            else:
                st = et = ""
            if block_node == title_node and authonly:
                continue
# note that the assumption that the Abstract heading is set in a larger font than the median font size used on a page will not find
# abstracts of Aalto University - as they set the word ABSTRACT in only a slightly larger font than the rest of the text, but they do set it in all CAPS
            if look_for_all_caps_headings:
                headers = block_node.xpath(
                    ".//TOKEN[@font-size > {0} or @bold = 'yes' or @font-color != '{1}']"
                    .format(mean_font_size, main_font_color))
            else:
                headers = block_node.xpath(
                    ".//TOKEN[@font-size > {0} or @bold = 'yes' or @font-color != '{1}']"
                    .format(mean_font_size * 1.05, main_font_color))

            head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in headers
            ])
            if head_txt in text_start_to_exclude:
                start_to_exclude = True
            head_txt = filter_headings(head_txt)
            if len(head_txt) and (not start_to_exclude):
                head_txts.append("{0}{1}{2}".format(st, head_txt, et))

            if head_txt.find("Abstract") >= 0 or head_txt.find(
                    "ABSTRACT") >= 0:
                if not Found_abstract:
                    print "Abstract (en):"
                    output_blocks_on_page(page_node, block_number, page)
                    Found_abstract = True
                break

            if head_txt.find("Sammanfattning") >= 0 or head_txt.find(
                    "SAMMANFATTNING") >= 0:
                if not Found_Sammanfattning:
                    print "Sammanfattning (sv):"
                    output_blocks_on_page(page_node, block_number, page)
                    Found_Sammanfattning = True
                break

            if head_txt.find("Abstrakt") >= 0 or head_txt.find(
                    "ABSTRAKT") >= 0:
                if not Found_Sammanfattning:
                    print "Abstrakt (sv):"
                    output_blocks_on_page(page_node, block_number, page)
                    Found_Sammanfattning = True
                break

            if head_txt.find("Referat") >= 0 or head_txt.find("REFERAT") >= 0:
                if not Found_Sammanfattning:
                    print "Referat (sv):"
                    output_blocks_on_page(page_node, block_number, page)
                    Found_Sammanfattning = True
                break


#
#            if head_txt.find("Abstracto(sp)") >= 0:
#                    print "Abstracto (sp):"
#                    output_blocks_on_page(page_node, block_number, page)
#                break
#
#            if head_txt.find("Abstrait (fr)") >= 0:
#                    print "Abstrait (fr):"
#                    output_blocks_on_page(page_node, block_number, page)
#                break

            if block_node == title_node and titleonly:
                stop = True
                break
            elif block_node == auth_node and authonly:
                stop = True
                break
        if stop:
            break
    for txt in head_txts:
        sys.stdout.writelines([txt, '\n'])
Example #25
    def run_job(self):
        is_test_completed = False

        if not self.install_local_pages():
            self.add_failure(
                self.name, TestStatus.TEST_UNEXPECTED_FAIL,
                'Aborting test - Could not install local pages on phone.',
                TreeherderStatus.EXCEPTION)
            return is_test_completed

        if not self.create_profile():
            self.add_failure(self.name, TestStatus.TEST_UNEXPECTED_FAIL,
                             'Aborting test - Could not run Fennec.',
                             TreeherderStatus.BUSTED)
            return is_test_completed

        perfherder_options = PerfherderOptions(self.perfherder_options,
                                               repo=self.build.tree)
        is_test_completed = True
        testcount = len(self._urls.keys())
        for testnum, (testname, url) in enumerate(self._urls.iteritems(), 1):
            self.loggerdeco = self.loggerdeco.clone(
                extradict={
                    'phoneid': self.phone.id,
                    'buildid': self.build.id,
                    'testname': testname
                },
                extraformat=
                'S1S2TestJob|%(phoneid)s|%(buildid)s|%(testname)s|%(message)s')
            self.dm._logger = self.loggerdeco
            self.loggerdeco.info('Running test (%d/%d) for %d iterations',
                                 testnum, testcount, self._iterations)

            command = None
            for attempt in range(1, self.stderrp_attempts + 1):
                # dataset is a list of the measurements made for the
                # iterations for this test.
                #
                # An empty item in the dataset list represents a
                # failure to obtain any measurement for that
                # iteration.
                #
                # It is possible for an item in the dataset to have an
                # uncached value and not have a corresponding cached
                # value if the cached test failed to record the
                # values.

                iteration = 0
                dataset = []
                for iteration in range(1, self._iterations + 1):
                    # Calling svc power stayon true will turn on the
                    # display for at least some devices if it has
                    # turned off.
                    self.dm.power_on()
                    command = self.worker_subprocess.process_autophone_cmd(
                        test=self, require_ip_address=url.startswith('http'))
                    if command['interrupt']:
                        self.handle_test_interrupt(command['reason'],
                                                   command['test_result'])
                        break
                    self.update_status(message='Attempt %d/%d for Test %d/%d, '
                                       'run %d, for url %s' %
                                       (attempt, self.stderrp_attempts,
                                        testnum, testcount, iteration, url))

                    if not self.create_profile():
                        self.add_failure(self.name,
                                         TestStatus.TEST_UNEXPECTED_FAIL,
                                         'Failed to create profile',
                                         TreeherderStatus.TESTFAILED)
                        continue

                    measurement = self.runtest(url)
                    if not measurement:
                        self.loggerdeco.warning(
                            '%s %s Attempt %s Failed to get uncached measurement.',
                            testname, url, attempt)
                        continue

                    self.add_pass(url)
                    dataset.append({'uncached': measurement})

                    measurement = self.runtest(url)
                    if not measurement:
                        self.loggerdeco.warning(
                            '%s %s Attempt %s Failed to get cached measurement.',
                            testname, url, attempt)
                        continue

                    self.add_pass(url)
                    dataset[-1]['cached'] = measurement

                    if self.is_stderr_below_threshold(
                        ('throbberstart', 'throbberstop'), dataset,
                            self.stderrp_accept):
                        self.loggerdeco.info(
                            'Accepted test (%d/%d) after %d of %d iterations',
                            testnum, testcount, iteration, self._iterations)
                        break

                if command and command['interrupt']:
                    break
                measurements = len(dataset)
                if measurements > 0 and self._iterations != measurements:
                    self.add_failure(self.name,
                                     TestStatus.TEST_UNEXPECTED_FAIL,
                                     'Failed to get all measurements',
                                     TreeherderStatus.TESTFAILED)
                elif measurements == 0:
                    # If we have not gotten a single measurement at this point,
                    # just bail and report the failure rather than wasting time
                    # continuing more attempts.
                    self.add_failure(self.name,
                                     TestStatus.TEST_UNEXPECTED_FAIL,
                                     'No measurements detected.',
                                     TreeherderStatus.BUSTED)
                    self.loggerdeco.info(
                        'Failed to get measurements for test %s after %d/%d attempt '
                        'of %d iterations', testname, attempt,
                        self.stderrp_attempts, self._iterations)
                    self.worker_subprocess.mailer.send(
                        '%s %s failed for Build %s %s on %s %s' %
                        (self.__class__.__name__, testname, self.build.tree,
                         self.build.id, utils.host(), self.phone.id),
                        'No measurements were detected for test %s.\n\n'
                        'Job        %s\n'
                        'Host       %s\n'
                        'Phone      %s\n'
                        'Repository %s\n'
                        'Build      %s\n'
                        'Revision   %s\n' %
                        (testname, self.job_url, utils.host(), self.phone.id,
                         self.build.tree, self.build.id, self.build.changeset))
                    break

                if self.is_stderr_below_threshold(
                    ('throbberstart', 'throbberstop'), dataset,
                        self.stderrp_reject):
                    rejected = False
                else:
                    rejected = True
                    self.loggerdeco.info(
                        'Rejected test (%d/%d) after %d/%d iterations',
                        testnum, testcount, iteration, self._iterations)

                self.loggerdeco.debug('publishing results')

                perfherder_values = {'geometric_mean': 0}
                metric_keys = ['throbberstart', 'throbberstop', 'throbbertime']
                cache_names = {'uncached': 'first', 'cached': 'second'}
                cache_keys = cache_names.keys()

                for metric_key in metric_keys:
                    perfherder_values[metric_key] = {'geometric_mean': 0}
                    for cache_key in cache_keys:
                        perfherder_values[metric_key][cache_key] = {
                            'median': 0,
                            'values': []
                        }

                for datapoint in dataset:
                    for cache_key in datapoint:
                        starttime = datapoint[cache_key]['starttime']
                        throbberstart = datapoint[cache_key]['throbberstart']
                        throbberstop = datapoint[cache_key]['throbberstop']
                        self.report_results(
                            starttime=starttime,
                            tstrt=throbberstart,
                            tstop=throbberstop,
                            testname=testname,
                            cache_enabled=(cache_key == 'cached'),
                            rejected=rejected)
                        perfherder_values['throbberstart'][cache_key][
                            'values'].append(throbberstart - starttime)
                        perfherder_values['throbberstop'][cache_key][
                            'values'].append(throbberstop - starttime)
                        perfherder_values['throbbertime'][cache_key][
                            'values'].append(throbberstop - throbberstart)

                test_values = []
                for metric_key in metric_keys:
                    for cache_key in cache_keys:
                        perfherder_values[metric_key][cache_key][
                            'median'] = utils.median(
                                perfherder_values[metric_key][cache_key]
                                ['values'])
                    perfherder_values[metric_key][
                        'geometric_mean'] = utils.geometric_mean([
                            perfherder_values[metric_key]['uncached']
                            ['median'],
                            perfherder_values[metric_key]['cached']['median']
                        ])
                    test_values.append(
                        perfherder_values[metric_key]['geometric_mean'])

                perfherder_suite = PerfherderSuite(
                    name=testname,
                    value=utils.geometric_mean(test_values),
                    options=perfherder_options)
                for metric_key in metric_keys:
                    for cache_key in cache_keys:
                        cache_name = cache_names[cache_key]
                        subtest_name = "%s %s" % (metric_key, cache_name)
                        perfherder_suite.add_subtest(
                            subtest_name,
                            perfherder_values[metric_key][cache_key]['median'],
                            options=perfherder_options)

                self.perfherder_artifact = PerfherderArtifact()
                self.perfherder_artifact.add_suite(perfherder_suite)
                self.loggerdeco.debug("PerfherderArtifact: %s",
                                      self.perfherder_artifact)

                if not rejected:
                    break

            if command and command['interrupt']:
                break

        return is_test_completed
Example #26
seasoned_snr = snr(image, seasoned_image)
print("Salt and pepper SNR: " + str(seasoned_snr) + " dB")

# Generate Gaussian noise
gaussed_image = random_noise(image, mode='gaussian', seed=0)
gaussed_snr = snr(image, gaussed_image)
print("Gaussian SNR: " + str(gaussed_snr) + " dB")

# Apply an averaging filter over image
# 5x5 averaging filter kernel (low pass)
avg_kernel = np.ones((5, 5)) / 25.0
averaged_simage = conv(seasoned_image, avg_kernel)
averaged_gimage = conv(gaussed_image, avg_kernel)

# Apply a median filter over image
median_simage = median(seasoned_image, 5)
median_gimage = median(gaussed_image, 5)

# Sobel edge detection filters
sx_kernel = [[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]]

sy_kernel = [[1, 2, 1], [0, 0, 0], [-1, -2, -1]]

# Sobel edge filter on noisy images
sx = np.abs(conv(seasoned_image, sx_kernel))
sx = threshold(sx, 0.3)
sy = np.abs(conv(seasoned_image, sy_kernel))
sy = threshold(sy, 0.3)
sobel_simage = sx + sy
sx = np.abs(conv(gaussed_image, sx_kernel))
sx = threshold(sx, 0.3)
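median(image, k) above is presumably a k x k median filter, the non-linear companion to the averaging kernel; a naive NumPy sketch of that operation, for illustration only:

import numpy as np

def median_filter(image, k=5):
    # naive k x k median filter with edge-replication padding (illustrative, not optimized)
    pad = k // 2
    padded = np.pad(image, pad, mode='edge')
    out = np.empty_like(image, dtype=float)
    for i in range(image.shape[0]):
        for j in range(image.shape[1]):
            out[i, j] = np.median(padded[i:i + k, j:j + k])
    return out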
Example #27
    def analyze_logcat(self):
        """
I/GeckoDump( 2284): __start_tp_report
I/GeckoDump( 2284): _x_x_mozilla_page_load
I/GeckoDump( 2284): _x_x_mozilla_page_load_details
I/GeckoDump( 2284): |i|pagename|runs|
I/GeckoDump( 2284): |0;amazon.com/www.amazon.com/index.html;2386;1146
I/GeckoDump( 2284): |1;m.yahoo.co.jp/www.yahoo.co.jp/index.html;1724;901
I/GeckoDump( 2284): |2;m.accuweather.com/www.accuweather.com/index.html;228;231
I/GeckoDump( 2284): |3;m.yandex.ru/www.yandex.ru/index.html;6043;2984
I/GeckoDump( 2284): |4;m.wikipedia.com/en.m.wikipedia.org/index.html;734;385
I/GeckoDump( 2284): |5;m.espn.com/m.espn.go.com/index.html;576;419
I/GeckoDump( 2284): |6;m.bbc.co.uk/www.bbc.co.uk/mobile/index.html;349;229
I/GeckoDump( 2284): __end_tp_report
I/GeckoDump( 2284): __start_cc_report
I/GeckoDump( 2284): _x_x_mozilla_cycle_collect,3390
I/GeckoDump( 2284): __end_cc_report
I/GeckoDump( 2284): __startTimestamp1433438438092__endTimestamp

        We will parse the syntax here and build up a {name:[value,],} hash.
        Next we will compute the median value for each name.
        Finally we will report the geometric mean of all of the median values.
        """
        self.loggerdeco.debug('analyzing logcat')

        re_page_data = re.compile('.*\|[0-9];([a-zA-Z0-9\.\/\-]+);([0-9;]+).*')
        re_end_report = re.compile('.*__end_tp_report.*')

        attempt = 1
        max_time = 90  # maximum time to wait for completeness score
        wait_time = 3  # time to wait between attempts
        max_attempts = max_time / wait_time

        results = {}
        pageload_metric = {'summary': 0}
        while attempt <= max_attempts and pageload_metric['summary'] == 0:
            buf = self.logcat.get()
            for line in buf:
                self.loggerdeco.debug('analyze_logcat: %s' % line)
                if re_end_report.match(line):
                    # calculate score
                    data = []
                    for page in results:
                        data.append(median(results[page]))
                        pageload_metric[page] = median(results[page])
                    pageload_metric['summary'] = geometric_mean(data)
                    break

                match = re_page_data.match(line)
                if match:
                    page_name = match.group(1)
                    numbers = match.group(2)
                    if page_name and numbers:
                        page_name = page_name.split('/')[0]
                        numbers = [float(x) for x in numbers.split(';')]
                        results[page_name] = numbers

            if self.fennec_crashed:
                # If fennec crashed, don't bother looking for pageload metric
                break
            if pageload_metric['summary'] == 0:
                sleep(wait_time)
                attempt += 1
        if pageload_metric['summary'] == 0:
            self.loggerdeco.warning('Unable to find pageload metric')

        return pageload_metric
Example #28
    def __do_divide_conquer(self, P):
        if len(P) <= 3: return self.__do_graham_scan(P)
        # divide
        m = utils.median([p.X for p in P])
        PL = [p for p in P if p.X <= m]
        PR = [p for p in P if p.X > m]
        QL = self.__do_divide_conquer(PL)
        QR = self.__do_divide_conquer(PR)
        # calculate polar angle
        i = np.argmin([p.Y for p in QL])
        j = np.argmin([p.Y for p in QR])
        if QL[i].Y > QR[j].Y:
            tmp = QL
            QL = QR
            QR = tmp
            i = j
        X = Point(QL[i].X + 1, QL[i].Y)
        O = QL[i]
        QL_pa = utils.calc_polar_angle(O, X, QL)
        QR_pa = utils.calc_polar_angle(O, X, QR)
        s = np.argmin(QR_pa)  # min polar angle in QR
        t = np.argmax(QR_pa)  # max polar angle in QR
        
        # merge
        QL = np.concatenate((QL[i:], QL[:i]))   # arrange in ascending polar angle order
        QL_pa = np.concatenate((QL_pa[i:], QL_pa[:i]))
        if s < t:
            QR_1 = QR[s:t]
            QR_2 = np.concatenate((QR[t:], QR[:s]))[::-1]
            QR_pa_1 = QR_pa[s:t]
            QR_pa_2 = np.concatenate((QR_pa[t:], QR_pa[:s]))[::-1]
        else:
            QR_1 = np.concatenate((QR[s:], QR[:t]))
            QR_2 = QR[t:s][::-1]
            QR_pa_1 = np.concatenate((QR_pa[s:], QR_pa[:t]))
            QR_pa_2 = QR_pa[t:s][::-1]

        l_len = len(QL)
        r_len_1 = len(QR_1)
        r_len_2 = len(QR_2)
        l_it = 0
        r_it_1 = 0
        r_it_2 = 0
        
        W = list()
        while True:
            if l_it >= l_len:
                if r_it_1 >= r_len_1: # extend r_2
                    W.extend(QR_2[r_it_2:])
                    r_it_2 = r_len_2
                    break
                elif r_it_2 >= r_len_2:   # extend r_1
                    W.extend(QR_1[r_it_1:])
                    r_it_1 = r_len_1
                    break
                else:   # append r_1 and r_2
                    if QR_pa_1[r_it_1] < QR_pa_2[r_it_2]:
                        W.append(QR_1[r_it_1])
                        r_it_1 += 1
                    else:
                        W.append(QR_2[r_it_2])
                        r_it_2 += 1
            elif r_it_1 >= r_len_1:
                if r_it_2 >= r_len_2:   # extend l
                    W.extend(QL[l_it:])
                    l_it = l_len
                    break
                else:   # append l and r_2
                    if QL_pa[l_it] < QR_pa_2[r_it_2]:
                        W.append(QL[l_it])
                        l_it += 1
                    else:
                        W.append(QR_2[r_it_2])
                        r_it_2 += 1
            elif r_it_2 >= r_len_2:
                if QL_pa[l_it] < QR_pa_1[r_it_1]:
                    W.append(QL[l_it])
                    l_it += 1
                else:
                    W.append(QR_1[r_it_1])
                    r_it_1 += 1
            else:   # append l, r_1 and r_2
                if QL_pa[l_it] < QR_pa_1[r_it_1] and QL_pa[l_it] < QR_pa_2[r_it_2]:
                    W.append(QL[l_it])
                    l_it += 1
                elif QR_pa_1[r_it_1] < QL_pa[l_it] and QR_pa_1[r_it_1] < QR_pa_2[r_it_2]:
                    W.append(QR_1[r_it_1])
                    r_it_1 += 1
                else:
                    W.append(QR_2[r_it_2])
                    r_it_2 += 1
        return self.__do_graham_scan(W, sort=False)

def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]
    try:
        try:
            opts, args = getopt.getopt(argv, "ht", ["help", "test", "noxml", "highlight", "title", "author"])
        except getopt.error as msg:
            raise UsageError(msg)
        for o, a in opts:
            if (o in ['-h', '--help']):
                # print help and exit
                sys.stdout.write(__doc__)
                sys.stdout.flush()
                return 0

        pdf2heads(opts, args)

    except UsageError as err:
        print >>sys.stderr, err.msg
        print >>sys.stderr, "for help use --help"
        return 2
    except ConfigError as err:
        sys.stderr.writelines([str(err.msg), '\n'])
        sys.stderr.flush()
        return 1
def pdf2heads(opts, args):
    xmltag = True
    highlight = False
    titleonly = False
    authonly = False
    for o, a in opts:
        if (o == '--noxml'):
            xmltag = False
        elif (o == '--highlight'):
            highlight = True
        if (o == '--title'):
            titleonly = True
        elif (o == '--author'):
            authonly = True

    tree = pdf2etree(args)

    # find title
    page = 1
    block = 1
    title_node = None
    while True:
        try: title_node = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(page, block))[0]
        except IndexError: page+=1
        else: break
        if page > 2:
            # probably not going to find it now
            break
        
    # find author
    page = 1
    block = 2
    auth_node = None
    while True:
        try: auth_node  = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(page, block))[0]
        except IndexError: block+=1
        else: break
        if block > 4:
            # probably not going to find it now
            break
    
    font_sizes = tree.xpath('//TOKEN/@font-size')
    mean_font_size = mean(font_sizes)
    median_font_size = median(font_sizes)

    #print "Median Font Size (i.e. body text):", median_font_size

    font_colors = tree.xpath('//TOKEN/@font-color')
    font_color_hash = {}
    for fc in font_colors:
        try:
            font_color_hash[fc]+=1
        except KeyError:
            font_color_hash[fc] = 1

    sortlist = [(v,k) for k,v in font_color_hash.iteritems()]
    sortlist.sort(reverse=True)
    main_font_color = sortlist[0][1]
    head_txts = []
    stop = False
    for page_node in tree.xpath('//PAGE'):
        for block_node in page_node.xpath('.//BLOCK'):
            if xmltag:
                if block_node == title_node:
                    st = "<title>"
                    et = "</title>"
                elif block_node == auth_node:
                    st = "<author>"
                    et = "</author>"
                else:
                    st = "<heading>"
                    et = "</heading>"
                    
                if highlight:
                    st = "\033[0;32m{0}\033[0m".format(st)
                    et = "\033[0;32m{0}\033[0m".format(et)
            else:
                st = et = ""
            if block_node == title_node and authonly:
                continue
            headers = block_node.xpath(".//TOKEN[@font-size > {0} or @bold = 'yes' or @font-color != '{1}']".format(mean_font_size*1.05, main_font_color))
            head_txt = ' '.join([etree.tostring(el, method='text', encoding="UTF-8") for el in headers])
            if len(head_txt):
                head_txts.append("{0}{1}{2}".format(st, head_txt, et))
                
            if block_node == title_node and titleonly:
                stop = True
                break
            elif block_node == auth_node and authonly:
                stop = True
                break
        if stop:
            break
    for txt in head_txts:
        sys.stdout.writelines([txt, '\n'])
    def analyze_logcat(self):
        """
I/GeckoDump( 2284): __start_tp_report
I/GeckoDump( 2284): _x_x_mozilla_page_load
I/GeckoDump( 2284): _x_x_mozilla_page_load_details
I/GeckoDump( 2284): |i|pagename|runs|
I/GeckoDump( 2284): |0;amazon.com/www.amazon.com/index.html;2386;1146
I/GeckoDump( 2284): |1;m.yahoo.co.jp/www.yahoo.co.jp/index.html;1724;901
I/GeckoDump( 2284): |2;m.accuweather.com/www.accuweather.com/index.html;228;231
I/GeckoDump( 2284): |3;m.yandex.ru/www.yandex.ru/index.html;6043;2984
I/GeckoDump( 2284): |4;m.wikipedia.com/en.m.wikipedia.org/index.html;734;385
I/GeckoDump( 2284): |5;m.espn.com/m.espn.go.com/index.html;576;419
I/GeckoDump( 2284): |6;m.bbc.co.uk/www.bbc.co.uk/mobile/index.html;349;229
I/GeckoDump( 2284): __end_tp_report
I/GeckoDump( 2284): __start_cc_report
I/GeckoDump( 2284): _x_x_mozilla_cycle_collect,3390
I/GeckoDump( 2284): __end_cc_report
I/GeckoDump( 2284): __startTimestamp1433438438092__endTimestamp

        We will parse the syntax here and build up a {name:[value,],} hash.
        Next we will compute the median value for each name.
        Finally we will report the geometric mean of all of the median values.
        """
        self.loggerdeco.debug('analyzing logcat')

        re_page_data = re.compile(
            r'.*\|[0-9];([a-zA-Z0-9\.\/\-]+);([0-9;]+).*')
        re_end_report = re.compile(r'.*__end_tp_report.*')

        attempt = 1
        max_time = 180  # maximum time to wait for tp report
        wait_time = 3  # time to wait between attempts
        max_attempts = max_time / wait_time

        results = {}
        pageload_metric = {'summary': 0}
        while attempt <= max_attempts and pageload_metric['summary'] == 0:
            buf = self.worker_subprocess.logcat.get()
            for line in buf:
                self.loggerdeco.debug('analyze_logcat: %s', line)
                if re_end_report.match(line):
                    # calculate score
                    data = []
                    for page in results:
                        data.append(median(results[page]))
                        # median of each page, ignoring the first run
                        pageload_metric[page] = median(results[page][1:])
                    pageload_metric['summary'] = geometric_mean(data)
                    break

                match = re_page_data.match(line)
                if match:
                    page_name = match.group(1)
                    numbers = match.group(2)
                    if page_name and numbers:
                        page_name = page_name.split('/')[0]
                        numbers = [float(x) for x in numbers.split(';')]
                        results[page_name] = numbers

            if self.handle_crashes():
                # If fennec crashed, don't bother looking for pageload metric
                break
            if pageload_metric['summary'] == 0:
                sleep(wait_time)
                attempt += 1
        if pageload_metric['summary'] == 0:
            self.loggerdeco.warning('Unable to find pageload metric')

        return pageload_metric
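A minimal, self-contained sketch of the summary computation the docstring above describes: take the median of each page's run times (ignoring the first run) and report the geometric mean of those medians. The page names come from the sample report in the docstring; the third run value per page and the local median/geometric_mean helpers are illustrative stand-ins, not the utilities the example actually imports.

import math

def median(values):
    # plain median over a sorted copy of the values
    s = sorted(values)
    n = len(s)
    mid = n // 2
    return s[mid] if n % 2 else (s[mid - 1] + s[mid]) / 2.0

def geometric_mean(values):
    # nth root of the product, computed via logs for numerical stability
    return math.exp(sum(math.log(v) for v in values) / len(values))

# per-page run times parsed from a tp report (last value per page is hypothetical)
results = {'amazon.com': [2386.0, 1146.0, 1201.0],
           'm.yahoo.co.jp': [1724.0, 901.0, 950.0]}

pageload_metric = {}
for page, runs in results.items():
    # median of each page, ignoring the first run
    pageload_metric[page] = median(runs[1:])
pageload_metric['summary'] = geometric_mean(list(pageload_metric.values()))
# pageload_metric -> {'amazon.com': 1173.5, 'm.yahoo.co.jp': 925.5, 'summary': ~1042}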
Exemple #32
0
    def run_job(self):
        is_test_completed = False

        if not self.install_local_pages():
            self.add_failure(
                self.name, TestStatus.TEST_UNEXPECTED_FAIL,
                'Aborting test - Could not install local pages on phone.',
                TreeherderStatus.EXCEPTION)
            return is_test_completed

        if not self.create_profile():
            self.add_failure(
                self.name, TestStatus.TEST_UNEXPECTED_FAIL,
                'Aborting test - Could not run Fennec.',
                TreeherderStatus.BUSTED)
            return is_test_completed

        perfherder_options = PerfherderOptions(self.perfherder_options,
                                               repo=self.build.tree)
        is_test_completed = True
        testcount = len(self._urls.keys())
        for testnum, (testname, url) in enumerate(self._urls.iteritems(), 1):
            self.loggerdeco = self.loggerdeco.clone(
                extradict={
                    'repo': self.build.tree,
                    'buildid': self.build.id,
                    'buildtype': self.build.type,
                    'sdk': self.phone.sdk,
                    'platform': self.build.platform,
                    'testname': testname
                },
                extraformat='S1S2TestJob %(repo)s %(buildid)s %(buildtype)s %(sdk)s %(platform)s %(testname)s %(message)s')
            self.dm._logger = self.loggerdeco
            self.loggerdeco.info('Running test (%d/%d) for %d iterations',
                                 testnum, testcount, self._iterations)

            command = None
            for attempt in range(1, self.stderrp_attempts+1):
                # dataset is a list of the measurements made for the
                # iterations for this test.
                #
                # An empty item in the dataset list represents a
                # failure to obtain any measurement for that
                # iteration.
                #
                # It is possible for an item in the dataset to have an
                # uncached value and not have a corresponding cached
                # value if the cached test failed to record the
                # values.
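                # For illustration only (hypothetical numbers), a completed
                # iteration appends an item shaped roughly like:
                #   {'uncached': {'starttime': 1000, 'throbberstart': 1250,
                #                 'throbberstop': 2400},
                #    'cached':   {'starttime': 5000, 'throbberstart': 5180,
                #                 'throbberstop': 5900}}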

                iteration = 0
                dataset = []
                for iteration in range(1, self._iterations+1):
                    # Calling svc power stayon true will turn on the
                    # display for at least some devices if it has
                    # turned off.
                    self.dm.power_on()
                    command = self.worker_subprocess.process_autophone_cmd(
                        test=self, require_ip_address=url.startswith('http'))
                    if command['interrupt']:
                        self.handle_test_interrupt(command['reason'],
                                                   command['test_result'])
                        break
                    self.update_status(message='Attempt %d/%d for Test %d/%d, '
                                       'run %d, for url %s' %
                                       (attempt, self.stderrp_attempts,
                                        testnum, testcount, iteration, url))

                    if not self.create_profile():
                        self.add_failure(
                            self.name,
                            TestStatus.TEST_UNEXPECTED_FAIL,
                            'Failed to create profile',
                            TreeherderStatus.TESTFAILED)
                        continue

                    measurement = self.runtest(url)
                    if not measurement:
                        self.loggerdeco.warning(
                            '%s %s Attempt %s Failed to get uncached measurement.',
                            testname, url, attempt)
                        continue

                    self.add_pass(url, text='uncached')
                    dataset.append({'uncached': measurement})

                    measurement = self.runtest(url)
                    if not measurement:
                        self.loggerdeco.warning(
                            '%s %s Attempt %s Failed to get cached measurement.',
                            testname, url, attempt)
                        continue

                    self.add_pass(url, text='cached')
                    dataset[-1]['cached'] = measurement

                    if self.is_stderr_below_threshold(
                            ('throbberstart',
                             'throbberstop'),
                            dataset,
                            self.stderrp_accept):
                        self.loggerdeco.info(
                            'Accepted test (%d/%d) after %d of %d iterations',
                            testnum, testcount, iteration, self._iterations)
                        break

                if command and command['interrupt']:
                    break
                measurements = len(dataset)
                if measurements > 0 and self._iterations != measurements:
                    self.add_failure(
                        self.name,
                        TestStatus.TEST_UNEXPECTED_FAIL,
                        'Failed to get all measurements',
                        TreeherderStatus.TESTFAILED)
                elif measurements == 0:
                    # If we have not gotten a single measurement at this point,
                    # just bail and report the failure rather than wasting time
                    # continuing more attempts.
                    self.add_failure(
                        self.name, TestStatus.TEST_UNEXPECTED_FAIL,
                        'No measurements detected.',
                        TreeherderStatus.BUSTED)
                    self.loggerdeco.info(
                        'Failed to get measurements for test %s after %d/%d attempt '
                        'of %d iterations', testname, attempt,
                        self.stderrp_attempts, self._iterations)
                    self.worker_subprocess.mailer.send(
                        '%s %s failed for Build %s %s on %s %s' %
                        (self.__class__.__name__, testname, self.build.tree,
                         self.build.id, utils.host(), self.phone.id),
                        'No measurements were detected for test %s.\n\n'
                        'Job        %s\n'
                        'Host       %s\n'
                        'Phone      %s\n'
                        'Repository %s\n'
                        'Build      %s\n'
                        'Revision   %s\n' %
                        (testname,
                         self.job_url,
                         utils.host(),
                         self.phone.id,
                         self.build.tree,
                         self.build.id,
                         self.build.changeset))
                    break

                if self.is_stderr_below_threshold(
                        ('throbberstart',
                         'throbberstop'),
                        dataset,
                        self.stderrp_reject):
                    rejected = False
                else:
                    rejected = True
                    self.loggerdeco.info(
                        'Rejected test (%d/%d) after %d/%d iterations',
                        testnum, testcount, iteration, self._iterations)

                self.loggerdeco.debug('publishing results')

                perfherder_values = {'geometric_mean': 0}
                metric_keys = ['throbberstart', 'throbberstop', 'throbbertime']
                cache_names = {'uncached': 'first', 'cached': 'second'}
                cache_keys = cache_names.keys()

                for metric_key in metric_keys:
                    perfherder_values[metric_key] = {'geometric_mean': 0}
                    for cache_key in cache_keys:
                        perfherder_values[metric_key][cache_key] = {'median': 0, 'values': []}

                for datapoint in dataset:
                    for cache_key in datapoint:
                        starttime = datapoint[cache_key]['starttime']
                        throbberstart = datapoint[cache_key]['throbberstart']
                        throbberstop = datapoint[cache_key]['throbberstop']
                        self.report_results(
                            starttime=starttime,
                            tstrt=throbberstart,
                            tstop=throbberstop,
                            testname=testname,
                            cache_enabled=(cache_key == 'cached'),
                            rejected=rejected)
                        perfherder_values['throbberstart'][cache_key]['values'].append(
                            throbberstart - starttime)
                        perfherder_values['throbberstop'][cache_key]['values'].append(
                            throbberstop - starttime)
                        perfherder_values['throbbertime'][cache_key]['values'].append(
                            throbberstop - throbberstart)

                test_values = []
                for metric_key in metric_keys:
                    for cache_key in cache_keys:
                        perfherder_values[metric_key][cache_key]['median'] = utils.median(
                            perfherder_values[metric_key][cache_key]['values'])
                    perfherder_values[metric_key]['geometric_mean'] = utils.geometric_mean(
                        [perfherder_values[metric_key]['uncached']['median'],
                         perfherder_values[metric_key]['cached']['median']])
                    test_values.append(perfherder_values[metric_key]['geometric_mean'])

                perfherder_suite = PerfherderSuite(name=testname,
                                                   value=utils.geometric_mean(test_values),
                                                   options=perfherder_options)
                for metric_key in metric_keys:
                    for cache_key in cache_keys:
                        cache_name = cache_names[cache_key]
                        subtest_name = "%s %s" % (metric_key, cache_name)
                        perfherder_suite.add_subtest(
                            subtest_name,
                            perfherder_values[metric_key][cache_key]['median'],
                            options=perfherder_options)

                self.perfherder_artifact = PerfherderArtifact()
                self.perfherder_artifact.add_suite(perfherder_suite)
                self.loggerdeco.debug("PerfherderArtifact: %s", self.perfherder_artifact)

                if not rejected:
                    break

            if command and command['interrupt']:
                break

        return is_test_completed
def extract_dataframes():

	for pid in pids:
		print ()
		print ('pid: ', pid)
		tac_reading = pd.read_csv('clean_tac/' + pid + '_clean_TAC.csv')
		acc_data = pd.read_csv('accelerometer/accelerometer_' + pid + '.csv')

		tac_labels = []

		for feat_no, feature in enumerate(features):
			print ('   feature:', feature)
			array_long = []

			for ind, row in tac_reading.iterrows():
				
				if ind!=0:
				
					t1, t2 = prev_row['timestamp'], row['timestamp']
					long_data = acc_data[ (acc_data['time']/1000 >= t1) & (acc_data['time']/1000 < t2) ]

					if not long_data.empty:
						
						if feat_no==0:
							if prev_row['TAC_Reading'] >= 0.08:
								tac_labels.append(1)
							else:
								tac_labels.append(0) 

						if feature=='rms':
							lt = []
							for axis in ['x', 'y', 'z']:
								lt.append(utils.rms(long_data[axis]))

							lt = np.array(lt)
							array_long.append(lt)

						else:
							short_datas = np.array_split(long_data, 300)
							
							# stores the features for every 1 second in 10 second segment
							array_short = []

							for short_seg, short_data in enumerate(short_datas):

								# data_short = data_long[data_long['short_segment']==short_seg]

								lt = []
								for axis in ['x', 'y', 'z']:
									data_axis =	np.array(short_data[axis])

									if feature=='mean':
										lt.append(utils.mean_feature(data_axis))
									elif feature=='std':
										lt.append(utils.std(data_axis))
									elif feature=='median':
										lt.append(utils.median(data_axis))
									elif feature=='crossing_rate':
										lt.append(utils.crossing_rate(data_axis))
									elif feature=='max_abs':
										lt.append(utils.max_abs(data_axis))
									elif feature=='min_abs':
										lt.append(utils.min_abs(data_axis))
									elif feature=='max_raw':
										lt.append(utils.max_raw(data_axis))
									elif feature=='min_raw':
										lt.append(utils.min_raw(data_axis))
									elif feature=='spec_entrp_freq':
										lt.append(utils.spectral_entropy_freq(data_axis))
									elif feature=='spec_entrp_time':
										lt.append(utils.spectral_entropy_time(data_axis))
									elif feature=='spec_centroid':
										lt.append(utils.spectral_centroid(data_axis))
									elif feature=='spec_spread':
										lt.append(utils.spectral_spread(data_axis))
									elif feature=='spec_rolloff':
										lt.append(utils.spectral_rolloff(data_axis))
									elif feature=='max_freq':
										lt.append(utils.max_freq(data_axis))
									elif feature=='spec_flux':
										if short_seg==0:
											lt.append(utils.spectral_flux(data_axis, np.zeros(len(data_axis))))
											if axis=='x':
												x = data_axis
											elif axis=='y':
												y = data_axis
											elif axis=='z':
												z = data_axis
										else:
											if axis=='x':
												if len(data_axis) > len(x):
													zeros = np.zeros(len(data_axis) - len(x))
													x = np.append(x, zeros)
												elif len(data_axis) < len(x):
													zeros = np.zeros(len(x) - len(data_axis))
													data_axis = np.append(data_axis, zeros)

												lt.append(utils.spectral_flux(data_axis, x))
											elif axis=='y':
												if len(data_axis) > len(y):
													zeros = np.zeros(len(data_axis) - len(y))
													y = np.append(y, zeros)
												elif len(data_axis) < len(y):
													zeros = np.zeros(len(y) - len(data_axis))
													data_axis = np.append(data_axis, zeros)

												lt.append(utils.spectral_flux(data_axis, y))
											elif axis=='z':
												if len(data_axis) > len(z):
													zeros = np.zeros(len(data_axis) - len(z))
													z = np.append(z, zeros)
												elif len(data_axis) < len(z):
													zeros = np.zeros(len(z) - len(data_axis))
													data_axis = np.append(data_axis, zeros)

												lt.append(utils.spectral_flux(data_axis, z))


								array_short.append(np.array(lt))
							
							short_metric = np.array(array_short)
							array_long.append(short_metric)

				prev_row = row
		
			if feature=='rms':
				df = pd.DataFrame(columns=['Rms_x', 'Rms_y', 'Rms_z'])
				long_metric = np.array(array_long)

				df['Rms_x'] = long_metric[:,0:1].flatten()
				df['Rms_y'] = long_metric[:,1:2].flatten()
				df['Rms_z'] = long_metric[:,2:].flatten()

				df.to_csv('features/' + feature + '_feature.csv', index=False)
			else:
				long_metric = np.array(array_long)

				summary_stats(long_metric, feature, pid)
		
		print ('   tac_labels: ', len(tac_labels))
		rename_column_and_concat(pid, tac_labels)
def pdf2heads(opts, args, document):
    global Verbose_flag
    global test_flag
    xmltag = True
    highlight = False
    titleonly = False
    authonly = False
    Verbose_flag = False
    test_flag = False
    global look_for_all_caps_headings
    look_for_all_caps_headings = False
    global automatic_rerunning
    global Found_Heading
    global Found_abstract
    global Found_org
    global Found_key
    global Found_Author
    global Found_Level
    global Found_Sammanfattning
    global Found_Method
    global Found_Introduction
    global Found_TOC
    global abstractOut_path
    global OrgandSup_path
    global referat_path
    global methodOut_path
    global introductionOut_path
    global toc_path
    global heading_path
    global title_path
    global author_path
    global subtitle_path
    global end_tag
    global tree
    global mean_font_size
    global main_font_color
    global document_type
    global author
    author = ""

    document_type = document

    start_to_exclude = False

    for o, a in opts:
        if (o == '--noxml'):
            xmltag = False
        elif (o == '--highlight'):
            highlight = True
        if (o == '--title'):
            titleonly = True
        elif (o == '--author'):
            authonly = True
        elif (o == '--unittest'):
            test_flag = True
        elif (o == '--verbose'):
            Verbose_flag = True
            print "Verbose_flag is on"
        elif (o == '--caps'):
            print "looking for ABSTRACT and other headers in all caps"
            look_for_all_caps_headings = True

    if automatic_rerunning:
        print "looking for ABSTRACT and other headers in all caps"
        look_for_all_caps_headings = True

    tree = pdf2etree(args)
    global title_head_txt

    # find title - look on the first page of the document at the first block of text on the page
    page = 1
    block = 1
    title_node = None
    while (page < 2):
        try:
            trial_title_node = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(
                page, block))[0]

            if Verbose_flag:  # verbose flag
                print "trial_title_node:"
                print trial_title_node

#            title_headers = trial_title_node.xpath(".//TOKEN[@font-size > {0}]".format(23))
# note that the Title is assumed to be 20 points or larger in size
            title_headers = trial_title_node.xpath(
                ".//TOKEN[(@font-size > {0} and @bold = 'yes') or (@font-size > {1} and @bold = 'yes')]"
                .format(20, 15))

            if Verbose_flag:  # verbose flag
                print "title_headers:"
                print title_headers

            title_head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in title_headers
            ])

            if len(title_head_txt):  # success: title found
                print "Title: found"
                title_path = '../../../../output/parse_result/' + directiory + '/title.txt'
                txt = title_head_txt
                st = 'title'
                json_append(st, txt)
                # with open(title_path, 'w') as f:
                #     print txt+ "\n"  # print tag information to certain file
                #     print >> f, txt, "\n"  # print tag information to certain file
                title_node = trial_title_node
                next_block = block + 1
                break
            block = block + 1
        except IndexError:
            page += 1

    # find subtitle - note that a subtitle is optional - start on the first page at the next block
    # (the second page cannot be used: it is the table of contents)
    page = 1
    block = next_block
    print_log("next block is:  " + str(block))
    subtitle_node = None
    while (page < 2):
        try:
            trial_subtitle_node = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(
                page, block))[0]

            if Verbose_flag:
                print "trial_subtitle_node:"
                print trial_subtitle_node

# the Subtitle is assumed to be larger than 19 points
            subtitle_headers = trial_subtitle_node.xpath(
                ".//TOKEN[(@font-size < {0} and @bold = 'no' and @italic= 'no') or (@font-size > {1} and @bold = 'no' and @italic= 'yes')]"
                .format(20, 13))
            if Verbose_flag:
                print "subtitle_headers:"
                print subtitle_headers
            subtitle_path = '../../../../output/parse_result/' + directiory + '/subtitle.txt'
            title_path = '../../../../output/parse_result/' + directiory + '/title.txt'

            subtitle_head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in subtitle_headers
            ])
            if len(subtitle_head_txt) and not subtitle_head_txt.isdigit():
                if title_head_txt == "Project proposal":
                    subtitle_path = title_path
                    print "Subtitle: not found"
                    print "Title: found since title is project proporsal, replace subtitle as title"
                txt = subtitle_head_txt
                st = 'subtitle'
                json_append(st, txt)
                # with open(subtitle_path, 'w') as f:
                #   print txt+ "\n"  # print tag information to certain file
                #
                #   print >> f, txt, "\n"  # print tag information to certain file
                subtitle_node = trial_subtitle_node
                next_block = block + 1
                print "Subtitle: found"
                break

            block = block + 1

        except IndexError:
            page += 1

    # find author - on cover page
    Found_Author = False
    Found_Level = False
    author_path = '../../../../output/parse_result/' + directiory + '/author_detail.txt'
    frontname_path = '../../../../output/parse_result/' + directiory + '/front_name.txt'
    aftername_path = '../../../../output/parse_result/' + directiory + '/after_name.txt'
    page = 1
    block = next_block
    auth_node = None
    auth_count = 0
    while (page < 2):
        try:
            trial_auth_node = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(
                page, block))[0]
            if Verbose_flag:
                print "trial_auth_node:"
                print trial_auth_node

# the author's name(s) is (are) assumed to be set smaller than the title but bigger than the "Degree Project ..." text
            auth_headers = trial_auth_node.xpath(
                ".//TOKEN[@font-size < {0}  and @font-size > {1}]".format(
                    20, 11))
            if Verbose_flag:
                print "auth_headers:"
                print auth_headers
            print_log(document_type)
            auth_head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in auth_headers
            ])
            auth_list = auth_head_txt.split(";")

            while (len(auth_head_txt) > 0) and auth_count < 2 and len(
                    auth_list) > auth_count:  #found
                print "Author: found"
                auth_head_txt = auth_list[auth_count - 1]
                auth_count += 1

                name_split = auth_head_txt.split()
                txt = auth_head_txt
                author = author + "_" + auth_head_txt
                author_path = '../../../../output/parse_result/' + directiory + '/author_' + str(
                    auth_count) + '.txt'
                st = 'author_' + str(auth_count)
                json_append(st, txt)
                # with open(author_path, 'w') as f:
                #     print txt + "in" + author_path
                #     print >> f, txt, "\n"  # print tag information to certain file
                txt = name_split[0]

                frontname_path = '../../../../output/parse_result/' + directiory + '/author_' + str(
                    auth_count) + '_frontname' + '.txt'
                st = 'author_' + str(auth_count) + '_frontname'
                json_append(st, txt)

                # with open(frontname_path, 'w') as f:
                #     print txt + "in" + frontname_path
                #     print >> f, txt, "\n"  # print tag information to certain file

                txt = name_split[1]

                aftername_path = '../../../../output/parse_result/' + directiory + '/author_' + str(
                    auth_count) + '_aftername' + '.txt'

                st = 'author_' + str(auth_count) + '_aftername'
                json_append(st, txt)

                # with open(aftername_path, 'w') as f:
                #     print txt + "in" + aftername_path
                #     print >> f, txt, "\n"  # print tag information to certain file
                auth_node = trial_auth_node

            block = block + 1
        except IndexError:
            page += 1

    font_sizes = tree.xpath('//TOKEN/@font-size')
    mean_font_size = mean(font_sizes)
    median_font_size = median(font_sizes)

    #    print "Median Font Size (i.e. body text):", median_font_size

    font_colors = tree.xpath('//TOKEN/@font-color')
    font_color_hash = {}
    for fc in font_colors:
        try:
            font_color_hash[fc] += 1
        except KeyError:
            font_color_hash[fc] = 1

    sortlist = [(v, k) for k, v in font_color_hash.iteritems()]
    sortlist.sort(reverse=True)
    main_font_color = sortlist[0][1]
    head_txts = []
    stop = False

    page = 0
    Found_abstract = False
    Found_org = False
    Found_key = False
    Found_Sammanfattning = False
    Found_Method = False
    Found_Introduction = False
    Found_TOC = False
    OrgandSup_path = '../../../../output/parse_result/' + directiory + '/Orignization_supervisor(en).txt'
    key_path = '../../../../output/parse_result/' + directiory + '/Keyword(en).txt'
    abstractOut_path = '../../../../output/parse_result/' + directiory + '/abstract(en).txt'
    abstractsvOut_path = '../../../../output/parse_result/' + directiory + '/abstract(sv).txt'
    referat_path = '../../../../output/parse_result/' + directiory + '/referat(sv).txt'
    methodOut_path = '../../../../output/parse_result/' + directiory + '/method(en).txt'
    toc_path = '../../../../output/parse_result/' + directiory + '/toc(en).txt'
    introductionOut_path = '../../../../output/parse_result/' + directiory + '/introduction(en).txt'
    heading_path = '../../../../output/parse_result/' + directiory + '/heading.txt'
    title_path = '../../../../output/parse_result/' + directiory + '/title.txt'

    #page node
    for page_node in tree.xpath('//PAGE'):
        page = page + 1
        block_number = 0
        for block_node in page_node.xpath('.//BLOCK'):
            block_number = block_number + 1
            if xmltag:
                # specify the data mining model
                # (single if/elif chain so title/subtitle/author blocks are not all tagged as headings)

                if block_node == title_node:  #found title
                    st = "title"
                    et = "title"
                elif block_node == subtitle_node:  # found subtitle
                    st = "subtitle"
                    et = "subtitle"
                elif block_node == auth_node:  # found author
                    st = "author"
                    et = "author"
                else:
                    st = "heading"
                    et = "heading"  #found other headings

                if highlight:
                    st = "\033[0;32m{0}\033[0m".format(st)
                    et = "\033[0;32m{0}\033[0m".format(et)
            else:
                st = et = ""
            if block_node == title_node and authonly:
                continue
# note that the assumption that the Abstract heading is set in a larger font than the median font size used on a page will not find
# abstracts of Aalto University - they set the word ABSTRACT in only a slightly larger font than the rest of the text, but they do set it in all CAPS
            if look_for_all_caps_headings:
                headers = block_node.xpath(
                    ".//TOKEN[(@font-size > {0} and @bold = 'yes') or @font-color != '{1}']"
                    .format(mean_font_size, main_font_color))
            else:
                headers = block_node.xpath(
                    ".//TOKEN[(@font-size > {0} and @bold = 'yes') or @font-color != '{1}']"
                    .format(mean_font_size * 1.05, main_font_color))
            level_headers = block_node.xpath(
                ".//TOKEN[@font-size > {0}]".format(0))

            head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in headers
            ])
            level_head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in level_headers
            ])

            # print head_txt
            if head_txt in text_start_to_exclude:
                start_to_exclude = True
            head_txt = filter_headings(head_txt)

            if len(head_txt) and (not start_to_exclude):
                head_txts.append("{0}{1}{2}".format(
                    st, head_txt, et))  # append st, the tag content, and et

                # model for proposal
            if (int(document_type) == 1):
                print_log("first content check: " + head_txt)
                if head_txt.find("Authors") >= 0 or head_txt.find(
                        "Author") >= 0:
                    if not Found_Author:  # if the author has not been found yet
                        print "Authors(en): OVERIDE "
                        print "Authors and detail information (en): found "
                        author = ""
                        output_text_on_block_on_page(page_node, block_number,
                                                     page, author_path)
                        author = auth
                        Found_Author = True

                if level_head_txt.find("Bachelor") >= 0 or level_head_txt.find(
                        "Master") >= 0 or level_head_txt.find(
                            "Degree Project") >= 0:
                    if not Found_Level:  # if the level has not been found yet
                        print_log("Level: found")
                        level_path = '../../../../output/parse_result/' + directiory + '/level.txt'
                        st = 'level'
                        json_append(st, level_head_txt)
                        # with open(level_path, 'w') as f:
                        #     print level_head_txt + "\n"  # print tag information to certain file
                        #     print >> f, level_head_txt, "\n"  # print tag information to certain file

                        Found_Level = True

                if head_txt.find("Organization and Supervisor") >= 0 or (
                        head_txt.find("Organization") >= 0
                        and head_txt.find("Supervisor") >= 0):
                    if not Found_org:  # if the organization and supervisor have not been found yet
                        print "Organization and Supervisor (en): found"
                        output_blocks_on_page(page_node, block_number, page,
                                              OrgandSup_path, 0)
                        Found_org = True

                if head_txt.find("Keywords") >= 0 or head_txt.find(
                        "Keyword") >= 0:
                    print_log("I should be herer!!!!!")
                    if not Found_key:  # if the abstract has not been found yet
                        print "Keywords(en): found"
                        output_blocks_on_page(page_node, block_number, page,
                                              key_path, 0)
                        Found_key = True

                # model for thesis
            if head_txt.find("Abstract") >= 0 or head_txt.find(
                    "ABSTRACT") >= 0:
                if not Found_abstract:  #if the abstract has not been found yet
                    print "Abstract (en): found"
                    output_blocks_on_page(page_node, block_number, page,
                                          abstractOut_path, 0)
                    Found_abstract = True
                break

            if head_txt.find("Sammanfattning") >= 0 or head_txt.find(
                    "SAMMANFATTNING") >= 0:
                if not Found_Sammanfattning:
                    print "Sammanfattning (sv): found"
                    output_blocks_on_page(page_node, block_number, page,
                                          abstractsvOut_path, 0)
                    Found_Sammanfattning = True
                break

            if head_txt.find("Abstrakt") >= 0 or head_txt.find(
                    "ABSTRAKT") >= 0:
                if not Found_Sammanfattning:
                    print "Abstrakt (sv): found"
                    output_blocks_on_page(page_node, block_number, page,
                                          abstractOut_path, 0)
                    Found_Sammanfattning = True
                break

            if head_txt.find("Referat") >= 0 or head_txt.find("REFERAT") >= 0:
                if not Found_Sammanfattning:
                    print "Referat (sv): found"
                    output_blocks_on_page(page_node, block_number, page,
                                          referat_path, 0)
                    Found_Sammanfattning = True
                break
            # table of contents
            if head_txt.find("Table of Contents") >= 0 or head_txt.find(
                    "Contents") >= 0:
                if not Found_TOC:  # if the table of contents has not been found yet
                    print "TOC (en): found"
                    output_blocks_on_page(page_node, block_number, page,
                                          toc_path, 0)
                    Found_TOC = True
                break

            if head_txt.find("Introduction") >= 0 or head_txt.find(
                    "INTRODUCTION") >= 0:
                if not Found_Introduction:  # if the introduction has not been found yet
                    print "Introduction (en): found"
                    output_blocks_on_page(page_node, block_number, page,
                                          introductionOut_path, 1)
                    Found_Introduction = True

                    #Found_Introduction = True
                break

            if head_txt.find("Methods") >= 0 or head_txt.find(
                    "METHODS") >= 0 or head_txt.find(
                        "Methodology") >= 0 or head_txt.find(
                            "METHODOLOGY") >= 0:
                if not Found_Method:  # if the methods section has not been found yet
                    print "Methods (en): found"
                    output_blocks_on_page(page_node, block_number, page,
                                          methodOut_path, 0)
                    Found_Method = True
                break


#
#            if head_txt.find("Abstracto(sp)") >= 0:
#                    print "Abstracto (sp):"
#                    output_blocks_on_page(page_node, block_number, page)
#                break
#
#            if head_txt.find("Abstrait (fr)") >= 0:
#                    print "Abstrait (fr):"
#                    output_blocks_on_page(page_node, block_number, page)
#                break

            if block_node == title_node and titleonly:
                stop = True
                break
            elif block_node == auth_node and authonly:
                stop = True
                break
        if stop:
            break
Exemple #35
0
def pdf2heads(opts, args):
    xmltag = True
    highlight = False
    titleonly = False
    authonly = False
    for o, a in opts:
        if (o == '--noxml'):
            xmltag = False
        elif (o == '--highlight'):
            highlight = True
        if (o == '--title'):
            titleonly = True
        elif (o == '--author'):
            authonly = True

    tree = pdf2etree(args)

    # find title
    page = 1
    block = 1
    title_node = None
    while True:
        try:
            title_node = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(
                page, block))[0]
        except IndexError:
            page += 1
        else:
            break
        if page > 2:
            # probably not going to find it now
            break

    # find author
    page = 1
    block = 2
    auth_node = None
    while True:
        try:
            auth_node = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(
                page, block))[0]
        except IndexError:
            block += 1
        else:
            break
        if block > 4:
            # probably not going to find it now
            break

    font_sizes = tree.xpath('//TOKEN/@font-size')
    mean_font_size = mean(font_sizes)
    median_font_size = median(font_sizes)

    #print "Median Font Size (i.e. body text):", median_font_size

    font_colors = tree.xpath('//TOKEN/@font-color')
    font_color_hash = {}
    for fc in font_colors:
        try:
            font_color_hash[fc] += 1
        except KeyError:
            font_color_hash[fc] = 1

    sortlist = [(v, k) for k, v in font_color_hash.iteritems()]
    sortlist.sort(reverse=True)
    main_font_color = sortlist[0][1]
    head_txts = []
    stop = False
    for page_node in tree.xpath('//PAGE'):
        for block_node in page_node.xpath('.//BLOCK'):
            if xmltag:
                if block_node == title_node:
                    st = "<title>"
                    et = "</title>"
                elif block_node == auth_node:
                    st = "<author>"
                    et = "</author>"
                else:
                    st = "<heading>"
                    et = "</heading>"

                if highlight:
                    st = "\033[0;32m{0}\033[0m".format(st)
                    et = "\033[0;32m{0}\033[0m".format(et)
            else:
                st = et = ""
            if block_node == title_node and authonly:
                continue
            headers = block_node.xpath(
                ".//TOKEN[@font-size > {0} or @bold = 'yes' or @font-color != '{1}']"
                .format(mean_font_size * 1.05, main_font_color))
            head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in headers
            ])
            if len(head_txt):
                head_txts.append("{0}{1}{2}".format(st, head_txt, et))

            if block_node == title_node and titleonly:
                stop = True
                break
            elif block_node == auth_node and authonly:
                stop = True
                break
        if stop:
            break
    for txt in head_txts:
        sys.stdout.writelines([txt, '\n'])