def _papers_citations_number_by_year_sframe(without_self_citation=True):
    """
    Get papers total number of citation in each year
    :param without_self_citation: if True calculate only non-self citations, otherwise calculate
        including self-citations
    :return: SFrame with a "Paper ID" column and a column that contains citations_dict by year
    :rtype: tc.SFrame
    """
    logger.info("Creating Paper Citations by Year (without_self_citation=%s)" %
                without_self_citation)
    ref_sf = tc.load_sframe(EXTENDED_PAPER_REFERENCES_SFRAME)
    if without_self_citation:
        # 'self citation' holds the number of authors shared by the citing and
        # cited papers; 0 means no author overlap (non-self citation).
        ref_sf = ref_sf[ref_sf['self citation'] == 0]

    sf = tc.load_sframe(PAPERS_SFRAME)["Paper ID", "Paper publish year"]
    sf = ref_sf.join(sf, on="Paper ID")
    # Count, per cited paper, how many citing papers were published each year.
    g = sf.groupby(["Paper reference ID", "Paper publish year"],
                   {"Citation Number": agg.COUNT()})
    g = g.rename({
        "Paper publish year": "Year",
        "Paper reference ID": "Paper ID"
    })
    # Pair each (year, count) so CONCAT below builds a per-paper collection.
    g['Citation by Year'] = g.apply(lambda r:
                                    (r["Year"], r["Citation Number"]))
    h = g.groupby(
        'Paper ID',
        {'Citation by Years': tc.aggregate.CONCAT('Citation by Year')})
    # _get_total_citation_by_year turns the (year, count) pairs into the final
    # per-year citation mapping (defined elsewhere in this module).
    if without_self_citation:
        h['Total Citations by Year without Self Citations'] = h[
            'Citation by Years'].apply(
                lambda l: _get_total_citation_by_year(l))
    else:
        h['Total Citations by Year'] = h['Citation by Years'].apply(
            lambda l: _get_total_citation_by_year(l))
    h = h.remove_column("Citation by Years")
    return h
def create_aminer_mag_links_by_doi_sframe():
    """
    Create Links Sframe that match papers from the MAG dataset with papers from the AMiner dataset
    based on the papers DOI.

    DOIs that occur more than once in either dataset are dropped so the match is
    one-to-one; rows with an empty title on either side are dropped as well.
    :return: None (the result is saved to AMINER_MAG_JOIN_SFRAME)
    """
    if os.path.isdir(AMINER_MAG_JOIN_SFRAME):
        return

    def keep_unique_dois(frame, doi_col):
        # Remove every row whose DOI appears more than once in `frame`.
        counts = frame.groupby(doi_col, {'Count': agg.COUNT()})
        duplicated = set(counts[counts['Count'] > 1][doi_col])
        frame = frame[frame[doi_col].apply(lambda doi: doi not in duplicated)]
        frame.materialize()
        return frame

    mag_sf = keep_unique_dois(tc.load_sframe(EXTENDED_PAPERS_SFRAME),
                              'Paper Document Object Identifier (DOI)')
    aminer_sf = keep_unique_dois(tc.load_sframe(AMINER_PAPERS_SFRAME), 'doi')

    joined = mag_sf.join(aminer_sf,
                         {'Paper Document Object Identifier (DOI)': 'doi'})
    joined['title_len'] = joined['title'].apply(lambda t: len(t))
    joined['title_len2'] = joined['Original paper title'].apply(lambda t: len(t))
    joined = joined[joined['title_len'] > 0]
    joined = joined[joined['title_len2'] > 0]

    joined = joined.rename({"Paper ID": "MAG Paper ID", "id": "Aminer Paper ID"})
    joined = joined.remove_columns(['title_len', 'title_len2'])
    joined.save(AMINER_MAG_JOIN_SFRAME)
Ejemplo n.º 3
0
 def coauthors_links_sframe(self):
     """Lazily load and cache the co-authorship links SFrame."""
     if self._co_authors_links is None:
         # Prefer the local copy; fall back to the S3-backed SFrame.
         path = (CO_AUTHORSHIP_LINK_SFRAME
                 if os.path.isdir(CO_AUTHORSHIP_LINK_SFRAME)
                 else CO_AUTHORSHIP_LINK_S3_SFRAME)
         self._co_authors_links = tc.load_sframe(path)
     return self._co_authors_links
Ejemplo n.º 4
0
 def _get_all_papers_sframe(self):
     """
     Return SFrame with all the papers published in the venue
     :return: Papers SFrame with all the papers details that were published in the venue
     :rtype tc.SFrame
     @note: The SFrame object was created by academic_parser.create_venue_papers_sframe
     """
     if self.venue_type == VenueType.journal:
         base_dir = JOURNALS_PAPERS_SFRAMES_DIR
     elif self.venue_type == VenueType.conference:
         base_dir = CONFERENCES_PAPERS_SFRAMES_DIR
     else:
         # No per-venue SFrame exists for other venue types.
         return None
     return tc.load_sframe("%s/%s.sframe" % (base_dir, self._venue_id))
Ejemplo n.º 5
0
def classify_page():
    """
    Bucket a numeric column of a data set into user-defined classes.

    GET renders the classifier form for the selected data set. POST reads the
    bracket definitions from the form (``lrange_i``/``urange_i``/``class_i``),
    maps every value of the target column into the class of the first bracket
    that contains it (``"unknown"`` when none does), appends the result as a
    new column and saves the transformed data set.
    """
    data_id = request.args.get('data_id')
    my_data = UserData.query.filter_by(id=data_id).first()
    my_model = TrainedModel()
    form = TrainModelForm(request.form, obj=my_model)
    # The original loaded the same SFrame twice; once is enough.
    data_frame = tc.load_sframe(my_data.sname)
    target = None
    names = data_frame.column_names()
    types = data_frame.column_types()
    cols = [str(n) for n in names]

    if request.method == 'POST':
        target = request.form['target']
        data_frame = data_frame.dropna(str(target), how="all")
        norig_data = data_frame[str(target)].to_numpy()
        num_brackets = int(request.form['num_brackets'])
        classes = []
        for data in norig_data:
            label = "unknown"
            for x in range(1, num_brackets + 1):
                low = float(request.form['lrange_' + str(x)])
                high = float(request.form['urange_' + str(x)])
                if low <= float(data) <= high:
                    print(request.form['class_' + str(x)])
                    label = request.form['class_' + str(x)]
                    # BUGFIX: was `continue`, so overlapping brackets appended
                    # several classes per value and misaligned column lengths.
                    break
            classes.append(label)

        data_frame = safely_add_col(str(request.form['field']), classes,
                                    data_frame)
        fwd_id = save_data(my_data, request.form['name'], data_frame)

        flash('Successfully transformed the data set!', 'success')
        return redirect(url_for('data.data_details_page', data_id=fwd_id))

    return render_template('pages/data/transforms/classifier.html',
        my_data=my_data,
        form=form,
        data_frame=data_frame,
        names=names,
        types=types,
        target=target,
        cols=cols)
 def test_real_data(self):
     """
     This test is excluded from regular build process. Only used for manual verification.
     """
     # Fixture paths inside the internal LFS checkout -- only available on
     # machines that have the gl-internal test data.
     train_path = _os.path.join(_lfs, 'gl-internal', 'internal-testdata',
                                'traindata3916.sframe')
     test_path = _os.path.join(_lfs, 'gl-internal', 'internal-testdata',
                               'testdata3916.sframe')
     train_data = tc.load_sframe(train_path)
     test_data = tc.load_sframe(test_path)
     # Deliberately tiny model (one iteration): the goal is to compare the
     # JSON-exported model's predictions against the native model, not to
     # train an accurate one.
     m = tc.boosted_trees_regression.create(train_data,
                                            target='is_cv',
                                            max_iterations=1,
                                            validation_set=None,
                                            max_depth=9)
     self._check_json_model_predict_consistency(m, test_data)
Ejemplo n.º 7
0
def split_page():
    """Render the train/test split form and, on POST, create and save the split."""
    try:
        data_id = request.args.get('data_id')
        my_data = UserData.query.filter_by(id=data_id).first()
        my_model = TrainedModel()
        form = TrainModelForm(request.form, obj=my_model)
        data_frame = tc.load_sframe(my_data.sname)

        if request.method == 'POST':
            # Deterministic split (seed=0) at the requested fraction.
            fraction = float(request.form['percent'])
            training_set, test_set = data_frame.random_split(fraction, seed=0)
            save_data(my_data, request.form['train'], training_set)
            save_data(my_data, request.form['test'], test_set)

            flash('Successfully created train/test split for ' + my_data.name + '!', 'success')
            return redirect(url_for('main.my_project_page', project_id=my_data.project_id))

        return render_template('pages/data/transforms/split.html',
            my_data=my_data,
            form=form,
            data_frame=data_frame)
    except Exception as e:
        # Record the failure in the ErrorLog table, then bounce to the referrer.
        flash('Opps!  Something unexpected happened.  On the brightside, we logged the error and will absolutely look at it and work to correct it, ASAP.', 'error')
        error = ErrorLog()
        error.user_id = current_user.id
        error.error = str(e.__class__)
        error.parameters = request.args
        db.session.add(error)
        db.session.commit()
        return redirect(request.referrer)
Ejemplo n.º 8
0
    def authors_features(self):
        """
        Create Authors SFrame in which each row is unique AuthorId and the author's various features
        :return: SFrame with Authors features
        :rtype: tc. SFrame
        """
        p_sf = self._p_sf[['PaperId']]  # 22082741
        a_sf = self._mag.paper_author_affiliations["AuthorId", "PaperId"]
        # Keep only authorships whose paper appears in the selected papers set.
        a_sf = a_sf.join(p_sf, on="PaperId")
        a_sf = a_sf[["AuthorId"]].unique()
        # Per-author papers dict feature.
        g = self.get_authors_papers_dict_sframe()
        a_sf = a_sf.join(g, on="AuthorId", how="left")  # 22443094 rows
        # Per-author co-authors dict feature.
        g = self.get_co_authors_dict_sframe()
        a_sf = a_sf.join(g, on="AuthorId", how='left')
        author_names = self._mag.author_names
        # First token of the normalized name, used to join the first-names
        # SFrame below.
        author_names["First Name"] = author_names["NormalizedName"].apply(
            lambda x: x.split(" ")[0])
        a_sf = a_sf.join(author_names, on="AuthorId", how="left")
        g_sf = tc.load_sframe(str(FIRST_NAMES_SFRAME))
        a_sf = a_sf.join(g_sf, on={"First Name": "First Name"}, how="left")

        # (source column, output column) pairs for per-year feature dicts.
        feature_names = [("AffiliationId", "Affilation by Year Dict"),
                         ('AuthorSequenceNumber',
                          'Sequence Number by Year Dict'),
                         ("ConferenceSeriesId", "Conference ID by Year Dict"),
                         ("JournalId", "Journal ID by Year Dict"),
                         ("OriginalVenue", "Venue by Year Dict")]
        for fname, col_name in tqdm(feature_names):
            f_sf = self._get_author_feature_by_year_sframe(fname, col_name)
            a_sf = a_sf.join(f_sf, on="AuthorId", how='left')

        return a_sf
Ejemplo n.º 9
0
def remove_columns_page():
    """
    Render the remove-columns form and, on POST, drop the selected columns
    from the data set and save the result under the supplied name.
    """
    try:
        data_id = request.args.get('data_id')
        my_data = UserData.query.filter_by(id=data_id).first()
        my_model = TrainedModel()
        form = TrainModelForm(request.form, obj=my_model)
        data_frame = tc.load_sframe(my_data.sname)

        if request.method == 'POST':
            # Form values may be unicode; SFrame expects plain str column names.
            features_str = [str(feat)
                            for feat in request.form.getlist('features')]
            sframe = data_frame.remove_columns(features_str)
            fwd_id = save_data(my_data, request.form['name'], sframe)

            # BUGFIX: user-facing message had a typo ("sucessful").
            flash('Data transform is successful!', 'success')
            return redirect(url_for('data.data_details_page', data_id=fwd_id))
        return render_template('pages/data/transforms/remove_columns.html',
            my_data=my_data,
            form=form,
            data_frame=data_frame,
            names=data_frame.column_names(),
            types=data_frame.column_types())
    except Exception as e:
        flash('Opps!  Something unexpected happened.  On the brightside, we logged the error and will absolutely look at it and work to correct it, ASAP.', 'error')
        error = ErrorLog()
        error.user_id = current_user.id
        error.error = str(e.__class__)
        error.parameters = request.args
        db.session.add(error)
        db.session.commit()
        return redirect(request.referrer)
Ejemplo n.º 10
0
def unique_page():
    """Add a row-number column (unique id) to a data set and save the result."""
    try:
        data_id = request.args.get('data_id')
        my_data = UserData.query.filter_by(id=data_id).first()
        my_model = TrainedModel()
        form = TrainModelForm(request.form, obj=my_model)
        data_frame = tc.load_sframe(my_data.sname)

        if request.method != 'POST':
            return render_template('pages/data/transforms/unique.html',
                my_data=my_data,
                data_frame=data_frame,
                form=form)

        new_id = str(request.form['new_id'])
        name = str(request.form['name'])
        numbered = data_frame.add_row_number(new_id)
        fwd_id = save_data(my_data, name, numbered)
        flash('Successfully transformed the data!', 'success')
        return redirect(url_for('data.data_details_page', data_id=fwd_id))
    except Exception as e:
        # Record the failure in the ErrorLog table, then bounce to the referrer.
        flash('Opps!  Something unexpected happened.  On the brightside, we logged the error and will absolutely look at it and work to correct it, ASAP.', 'error')
        error = ErrorLog()
        error.user_id = current_user.id
        error.error = str(e.__class__)
        error.parameters = request.args
        db.session.add(error)
        db.session.commit()
        return redirect(request.referrer)
Ejemplo n.º 11
0
def web_api_page():
    """
    Render the web-API documentation page for a trained model, including an
    example JSON payload built from the model's feature names and column types.
    """
    try:
        model_id = request.args.get('model_id')
        my_model = TrainedModel.query.filter_by(id=model_id).first()
        my_data = UserData.query.filter_by(id=my_model.data_id).first()
        # BUGFIX: `is not` compared object identity, which is unreliable for
        # DB ids (ints); use value inequality instead.
        if my_data.user_id != current_user.id:
            flash('Opps!  No data found', 'error')
            return redirect(request.referrer)
        data_frame = tc.load_sframe(my_data.sname)

        names = data_frame.column_names()
        types = data_frame.column_types()
        # Column name -> column type.
        type_map = {str(name): col_type for name, col_type in zip(names, types)}

        # Example payload: feature name -> type name (e.g. "int", "str").
        example_json = {feature: type_map[feature].__name__
                        for feature in my_model.features['features']}

        return render_template('pages/models/web_api.html',
            my_data=my_data,
            type_map=type_map,
            example_json=json.dumps(example_json, sort_keys=True, indent=4,
                                    separators=(',', ': ')),
            my_model=my_model)
    except Exception as e:
        flash('Opps!  Something unexpected happened.  On the brightside, we logged the error and will absolutely look at it and work to correct it, ASAP.', 'error')
        error = ErrorLog()
        error.user_id = current_user.id
        error.error = str(e.__class__)
        error.parameters = request.args
        db.session.add(error)
        db.session.commit()
        return redirect(request.referrer)
Ejemplo n.º 12
0
 def __init__(self):
     """Load the precomputed image SFrame and similarity model from disk."""
     self.imgframe = tc.load_sframe('model/final/final.sframe')
     self.model = tc.load_model('model/final/final_model')
     self.sample = tc.Image()  # image currently being queried
     self.results = SFrame()  # results of the last similarity query
     self.rows = SArray()  # rows selected from the last query
     self.pathlist = []  # file paths of matched images
     self.distance_list = []  # distances of matched images
def create_references_count_sframe():
    """Creating SFrame with the number of references in each paper"""
    logger.info("Creating References Count SFrame")
    if os.path.isdir(PAPER_REFERENCES_COUNT_SFRAME):
        # Already built on a previous run.
        return
    references = tc.load_sframe(PAPER_REFERENCES_SFRAME)
    counts = references.groupby("Paper ID", {"Ref Number": agg.COUNT()})
    counts.save(PAPER_REFERENCES_COUNT_SFRAME)
Ejemplo n.º 14
0
    def get_valid_venues_papers_ids_sframe(min_ref_number,
                                           min_journal_papers_num):
        """
        Return an SFrame of journals that pass all four validity criteria
        below, together with the ids of their qualifying papers.

        :param min_ref_number: minimal number of references a paper must have
        :param min_journal_papers_num: minimal number of qualifying papers a journal must have
        :return: SFrame with one row per valid journal
        :rtype: tc.SFrame
        """

        # Criteria I: we use only journals that have paper with valid DOI that appears in both AMiner and MAG datasets
        sf = tc.load_sframe(str(AMINER_MAG_JOIN_SFRAME))
        sf['Original venue name'] = sf['Original venue name'].apply(
            lambda n: n.lower())
        g = sf.groupby(
            'Journal ID mapped to venue name', {
                'venue name': agg.CONCAT('Original venue name'),
                'issn': agg.CONCAT('issn')
            })

        g['issn'] = g['issn'].apply(lambda l: list(set(l)))
        g['venue name'] = g['venue name'].apply(lambda l: list(set(l)))

        # Criteria II: the journal has only a single name
        g = g[g['venue name'].apply(lambda l: len(l) == 1)]
        g.materialize()
        g['venue name'] = g['venue name'].apply(lambda l: l[0].strip())

        # Criteria III:  the journal's name appears in SJR
        sjr_dict = VenueFetcher.get_sjr_journals_dict()
        g = g[g['venue name'].apply(lambda v: v in sjr_dict)]

        venues_ids = set(g['Journal ID mapped to venue name'])

        # Criteria IV: Each venue need to have at least min_journal_papers_num papers with at
        # least min_ref_number refs in each paper
        dataset_dir = pathlib.Path(STORAGE_PATH)
        mag_path = dataset_dir / "MAG"
        mag = MicrosoftAcademicGraph(mag_path)

        sf = mag.extended_papers['Journal ID mapped to venue name',
                                 'Original venue name', 'Paper ID',
                                 'Ref Number']
        sf = sf[sf['Ref Number'] >= min_ref_number]
        sf.materialize()
        sf = sf[sf['Journal ID mapped to venue name'].apply(
            lambda i: i in venues_ids)]
        sf['Journal name'] = sf['Original venue name'].apply(
            lambda n: n.lower().strip())
        sf.materialize()
        # Notice that with the full Papers SFrame a journal can have several names
        g = sf.groupby(
            ['Journal ID mapped to venue name'], {
                'Count': agg.COUNT(),
                'Paper IDs List': agg.CONCAT("Paper ID"),
                'Journals names': agg.CONCAT('Journal name')
            })
        g['Journals names'] = g['Journals names'].apply(lambda l: list(set(l)))
        g = g[g['Count'] >= min_journal_papers_num]
        # Keep only journals whose papers all share one (lower-cased) name.
        g = g[g['Journals names'].apply(lambda l: len(l) == 1)]
        g['Journals names'] = g['Journals names'].apply(lambda l: l[0])
        g = g.rename({'Journals names': 'Journal name'})
        g.materialize()

        return g
Ejemplo n.º 15
0
def predictions_step1_page():
    """
    Run a saved model against a user-chosen data set and store the predictions.

    GET renders the data-set picker; POST loads the chosen SFrame, applies the
    model, records predictions + originals in a Predictions row and writes the
    annotated data set to a CSV file.
    """
    tc.config.set_num_gpus(0)  # force CPU inference
    model_id = request.args.get('model_id')
    my_model = TrainedModel.query.filter_by(id=model_id).first()
    my_data = UserData.query.filter_by(project_id=my_model.project_id).all()
    # BUGFIX: `is not` compared object identity, which is unreliable for DB
    # ids (ints); use value inequality instead.
    if my_data[0].user_id != current_user.id:
        flash('Opps!  No data found', 'error')
        return redirect(request.referrer)

    form = UserProfileForm(request.form, obj=current_user)
    if request.method == 'POST':
        data_id = request.form['data_set_id']
        my_data = UserData.query.filter_by(id=data_id).first()
        data_frame = tc.load_sframe(my_data.sname)
        if my_model.features['model_type'] == 'deep':
            # Deep models require rows ordered by session then time.
            tfrm = data_frame.to_dataframe()
            tfrm = tfrm.sort_values(by=[my_model.features["session_id"],
                                        my_model.features["time_field"]])
            data_frame = tc.SFrame(data=tfrm)
            data_frame[str(my_model.features["session_id"])] = data_frame[
                str(my_model.features["session_id"])].astype(int)

        model = tc.load_model(my_model.mname)
        predictions = model.predict(data_frame).to_numpy()

        my_dict = Predictions()
        my_dict.model_id = my_model.id
        my_dict.user_id = current_user.id
        my_dict.path = my_model.path
        my_dict.input_file = my_data.name
        my_dict.predictions = [str(item) for item in predictions]
        origs = [str(item)
                 for item in data_frame[str(my_model.features['target'])]]

        # "fill" mode: keep the original value wherever one already exists.
        # NOTE(review): origs holds str() of every value, so entries are never
        # None and this branch currently overwrites *all* predictions with the
        # originals; the check probably needs to treat ""/"None" as blank --
        # confirm intended behavior before changing it.
        if request.form['mode'] == "fill":
            for x in range(len(predictions)):
                if origs[x] is not None:
                    predictions[x] = origs[x]

        my_dict.originals = origs
        data_frame = safely_add_col('Predicted_Value', predictions, data_frame)
        my_dict.oname = os.path.join(my_dict.path,
                                     str(uuid.uuid4()) + "_model_predictions.csv")
        data_frame.save(my_dict.oname, format='csv')
        db.session.add(my_dict)
        db.session.commit()

        # Redirect to home page
        return redirect(url_for('model.prediction_page', dict=my_dict.id))
    return render_template('pages/models/predict_step1.html',
        my_data=my_data,
        my_model=my_model,
        form=form)
def create_aminer_mag_sjr_sframe(year):
    """
    Creates a unified SFrame of AMiner, MAG, and the SJR datasets
    :param year: year to use for SJR data
    :return: SFrame with AMiner, MAG, and SJR data
    :rtype: tc.SFrame
    """
    sf = tc.load_sframe(AMINER_MAG_JOIN_SFRAME)
    # Drop rows with a missing or literal 'null' ISSN.
    sf = sf[sf['issn'] != None]
    sf = sf[sf['issn'] != 'null']
    sf.materialize()
    issn_re = re.compile(r"(\d+)-(\d+)")

    def normalize_issn(raw):
        # "1234-5678" -> "12345678"; None when the value doesn't match.
        matches = issn_re.findall(raw)
        return "".join(matches[0]) if matches else None

    sf['issn_str'] = sf['issn'].apply(normalize_issn)
    sf = sf[sf['issn_str'] != None]
    sjr_sf = tc.load_sframe(SJR_SFRAME)
    sjr_sf = sjr_sf[sjr_sf['Year'] == year]
    return sf.join(sjr_sf, on={'issn_str': "ISSN"})
Ejemplo n.º 17
0
def recode_step2_page():
    """
    Re-code a categorical column into user-chosen integer codes.

    GET renders one input per unique value of ``target``; POST reads the
    integer code for each unique value (form fields ``new_valueX``), keeps a
    copy of the original column as ``<target>_uncoded`` and replaces the
    column with the mapped integers before saving under ``name``.
    """
    try:
        data_id = request.args.get('data_id')
        target = request.args.get('target')
        name = request.args.get('name')
        my_data = UserData.query.filter_by(id=data_id).first()
        my_model = TrainedModel()
        form = TrainModelForm(request.form, obj=my_model)
        data_frame = tc.load_sframe(my_data.sname)
        names = data_frame.column_names()
        types = data_frame.column_types()

        orig_data = data_frame[str(target)]
        norig_data = orig_data.to_numpy()

        target_data = data_frame[str(target)].unique()
        ntarget_data = target_data.to_numpy()

        if request.method == 'POST':
            # Keep an uncoded copy of the column before overwriting it.
            data_frame = safely_add_col(str(target) + '_uncoded',
                                        data_frame[str(target)], data_frame)
            # Unique value -> user-chosen integer code. (The original also
            # built an unused `cross_ref` list of str columns -- dead code,
            # removed.)
            code_map = {
                str(value): int(request.form['new_value' + str(x)])
                for x, value in enumerate(ntarget_data)
            }
            # Dict lookup replaces the original O(rows * uniques) scan with
            # identical results, since every row value is one of the uniques.
            new_data = [code_map[str(field)] for field in norig_data]
            data_frame[str(target)] = SArray(new_data)
            fwd_id = save_data(my_data, name, data_frame)

            flash('Successfully re-coded ' + target + '!', 'success')
            return redirect(url_for('data.data_details_page', data_id=fwd_id))
        return render_template('pages/data/transforms/code_field_step2.html',
            my_data=my_data,
            form=form,
            data_frame=data_frame,
            names=names,
            name=name,
            types=types,
            ntarget_data=ntarget_data,
            target=target)
    except Exception as e:
        flash('Opps!  Something unexpected happened.  On the brightside, we logged the error and will absolutely look at it and work to correct it, ASAP.', 'error')
        error = ErrorLog()
        error.user_id = current_user.id
        error.error = str(e.__class__)
        error.parameters = request.args
        db.session.add(error)
        db.session.commit()
        return redirect(request.referrer)
def create_paper_keywords_list_sframe():
    """
    Creating Paper Keywords List SFrame
    """
    logger.info("Creating Papers' Keywords List SFrame")
    if os.path.isdir(PAPER_KEYWORDS_LIST_SFRAME):
        # Already built on a previous run.
        return

    keywords = tc.load_sframe(PAPER_KEYWORDS_SFRAME)
    grouped = keywords.groupby("Paper ID",
                               {"Keywords List": agg.CONCAT("Keyword name")})
    grouped.save(PAPER_KEYWORDS_LIST_SFRAME)
Ejemplo n.º 19
0
def fill_na_page():
    """
    Replace missing values in the selected columns with a user-supplied value.

    GET renders the fill-NA form; POST parses the replacement value with each
    selected column's dtype (int/float/str), applies fillna per column and
    saves the transformed data set under the supplied name.
    """
    try:
        data_id = request.args.get('data_id')
        my_data = UserData.query.filter_by(id=data_id).first()
        my_model = TrainedModel()
        form = TrainModelForm(request.form, obj=my_model)
        data_frame = tc.load_sframe(my_data.sname)
        names = data_frame.column_names()
        types = data_frame.column_types()

        # dtype name -> (converter, human-readable label). Replaces three
        # near-identical if/try blocks; also fixes the "an float"/"an string"
        # grammar in the user-facing messages.
        converters = {
            "int": (int, 'an integer'),
            "float": (float, 'a float'),
            "str": (str, 'a string'),
        }

        if request.method == 'POST':
            value = str(request.form['value'])
            name = str(request.form['name'])
            for feature in request.form.getlist('features'):
                orig_data = data_frame[str(feature)]
                dtype_name = orig_data.dtype.__name__
                print(dtype_name)
                if dtype_name in converters:
                    convert, label = converters[dtype_name]
                    try:
                        data_frame[str(feature)] = orig_data.fillna(
                            convert(value))
                    except Exception:
                        flash('Opps!  Looks like you passed something I could not parse as %s.' % label, 'error')
                        return redirect(request.referrer)
            fwd_id = save_data(my_data, name, data_frame)
            flash('Successfully replaced N/A values!', 'success')
            return redirect(url_for('data.data_details_page', data_id=fwd_id))

        return render_template('pages/data/transforms/fill_na.html',
            my_data=my_data,
            data_frame=data_frame,
            names=names,
            types=types,
            form=form)
    except Exception as e:
        flash('Opps!  Something unexpected happened.  On the brightside, we logged the error and will absolutely look at it and work to correct it, ASAP.', 'error')
        error = ErrorLog()
        error.user_id = current_user.id
        error.error = str(e.__class__)
        error.parameters = request.args
        db.session.add(error)
        db.session.commit()
        return redirect(request.referrer)
def _create_field_of_study_paper_ids_sframe(level):
    """
    Create SFrame in which each row contains a field of study and its matching list of paper ids
    :param level: field of study level
    :return: SFrame with the fields of study in the input level and their papers ids
    :rtype: tc.SFrame
    """
    logger.info("Creating fields of study paper ids SFrame level - %s " %
                level)

    col = 'Fields of study parent list (L%s)' % level
    sf = tc.load_sframe(EXTENDED_PAPERS_SFRAME)
    new_col_name = "Field ID"
    sf = sf.stack(col, new_column_name=new_col_name)
    # BUGFIX: stack() replaces `col` with `new_col_name`, so the original
    # filter on sf[col] referenced a removed column; filter the new one.
    sf = sf[sf[new_col_name] != None]
    g = sf.groupby(new_col_name, {'Paper IDs': agg.CONCAT("Paper ID")})
    f_sf = tc.load_sframe(FIELDS_OF_STUDY_SFRAME)
    g = g.join(f_sf, on={new_col_name: "Field of study ID"})
    g['Number of Paper'] = g['Paper IDs'].apply(lambda l: len(l))
    g['Level'] = level
    g = g.rename({new_col_name: "Field of study ID"})
    return g
def create_extended_papers_sframe():
    """
    Create extended papers SFrame which contains various papers features, such as paper citation
    numbers, authors list, urls, etc.
    :return: None (the result is saved to EXTENDED_PAPERS_SFRAME)
    """
    logger.info("Creating Extended Papers SFrame")
    if os.path.isdir(EXTENDED_PAPERS_SFRAME):
        return
    sf = tc.load_sframe(PAPERS_SFRAME)

    sframes_list = [
        PAPER_REFERENCES_COUNT_SFRAME, PAPERS_CITATIONS_BYYEAR_SFRAME,
        PAPERS_ORDERED_AUTHORS_LIST_SFRAME, PAPER_KEYWORDS_LIST_SFRAME,
        PAPERS_FIELDS_OF_STUDY_SFRAME, PAPER_URLS_SFRAME
    ]

    for s in sframes_list:
        t = tc.load_sframe(s)
        sf = sf.join(t, how="left", on="Paper ID")
    # Papers without references get a missing Ref Number from the left join.
    sf = sf.fillna("Ref Number", 0)
    # PERF: save once at the end; the original saved inside the loop,
    # rewriting the whole SFrame to disk on every iteration for no benefit.
    sf.save(EXTENDED_PAPERS_SFRAME)
def get_papers_sframe(min_ref_num=None, start_year=None, end_year=None):
    """
    Return SFrame with Papers data according to the input filter variables
    :param min_ref_num: paper's minimal references number
    :param start_year: start year (only include papers that were published after start year)
    :param end_year: end year (only include papers that were published before end year)
    :return: SFrame with paper data
    :rtype: tc.SFrame
    :note: after the SFrame is created it is saved to the TMP_DIR for future use
    """
    # PERF: check the cache before loading anything -- the references SFrame
    # is large, and the original loaded it even when a cached result existed.
    tmp_papers_sf_path = _get_tmp_papers_sframe_path(min_ref_num, start_year,
                                                     end_year)
    if os.path.isdir(tmp_papers_sf_path):
        return tc.load_sframe(tmp_papers_sf_path)

    sf = tc.load_sframe(PAPER_REFERENCES_SFRAME)
    if min_ref_num is not None:
        logger.info(
            f"Getting papers ids with at least {min_ref_num} references")
        sf = sf.groupby(
            'Paper ID',
            {'Ref Count': agg.COUNT()})  # There are 30058322 in the list
        sf = sf[sf['Ref Count'] >= min_ref_num]  # left with 22,083,058
        sf.__materialize__()
    p_sf = tc.load_sframe(PAPERS_SFRAME)
    sf = p_sf.join(sf)
    if start_year is not None:
        logger.info("Getting papers from %s " % start_year)
        sf = sf[sf['Paper publish year'] >= start_year]
    if end_year is not None:
        logger.info("Getting papers until %s " % end_year)
        sf = sf[sf['Paper publish year'] <= end_year]
    sf.__materialize__()

    # Guard against a concurrent run having created the cache meanwhile.
    if not os.path.isdir(tmp_papers_sf_path):
        sf.save(tmp_papers_sf_path)

    return sf
def create_extended_references_sframe():
    """
    Create SFrame with references data with additional column that states if the reference is a self-citation
    """
    logger.info("Creating Extended References  SFrame")
    if os.path.isdir(EXTENDED_PAPER_REFERENCES_SFRAME):
        return
    ref_sf = tc.load_sframe(PAPER_REFERENCES_SFRAME)
    p_sf = tc.load_sframe(PAPERS_ORDERED_AUTHORS_LIST_SFRAME)
    # Attach the author lists of the citing paper and of the cited paper.
    ref_sf = ref_sf.join(p_sf, on='Paper ID', how="left")
    ref_sf = ref_sf.join(p_sf,
                         on={'Paper reference ID': 'Paper ID'},
                         how="left")
    # Missing author lists become empty lists so the intersection below works.
    ref_sf = ref_sf.fillna('Authors List Sorted.1', [])
    ref_sf = ref_sf.fillna('Authors List Sorted', [])
    ref_sf.__materialize__()
    # NOTE: 'self citation' holds the NUMBER of shared authors, not a boolean;
    # consumers treat any non-zero value as a self-citation (== 0 as not).
    ref_sf['self citation'] = ref_sf.apply(lambda r: len(
        set(r['Authors List Sorted.1']) & set(r['Authors List Sorted'])))
    ref_sf.__materialize__()
    ref_sf = ref_sf.remove_columns(
        ['Authors List Sorted.1', 'Authors List Sorted'])

    ref_sf.save(EXTENDED_PAPER_REFERENCES_SFRAME)
Ejemplo n.º 24
0
 def wrapper_repeat(self, *args, **kwargs):
     """
     Load `sframe` from the cache directory, building it with the wrapped
     function (and saving the result) on a cache miss. When the table has a
     registered MAG download URL, the raw source file is downloaded first.
     """
     sframe_path = pathlib.Path(self._sframe_dir).joinpath(sframe)
     if not sframe_path.exists():
         table_name = sframe.split(".")[0]
         if table_name in MAG_URL_DICT:
             url = MAG_URL_DICT[table_name]
             # BUGFIX: use a raw string -- "\/" in a normal literal is an
             # invalid escape sequence (DeprecationWarning today, a
             # SyntaxError in future Python versions); the pattern itself
             # is unchanged.
             mag_file = self._dataset_dir / re.search(r".*files/(.*?)\?", url).group(1)
             if not pathlib.Path(mag_file).exists():
                 download_file(url, mag_file)

         value = func(self, *args, **kwargs)
         value.save(str(sframe_path))
     else:
         value = load_sframe(str(sframe_path))
     return value
Ejemplo n.º 25
0
def load_functions_partition(directory, name):
    """Load a functions SFrame partition and normalize its column names."""
    suffix = '' if name is None else name
    logging.info(f"Loading functions from {directory}{suffix}")
    frame = tc.load_sframe(f"{directory}{suffix}")

    # Drop the obsolete count column if present.
    if 'fcount' in frame.column_names():
        frame.remove_column('fcount', inplace=True)

    # Normalize hashed column names to their plain equivalents.
    for old, new in (('hapk', 'apk'), ('hfunc', 'function')):
        if old in frame.column_names():
            frame.rename(names={old: new}, inplace=True)

    return frame
Ejemplo n.º 26
0
def create_mod(path):
    """Load or build the image SFrame and similarity model under *path*.

    Each artifact is loaded from disk when it already exists; otherwise
    it is created from the images in *path* and saved for future runs.

    :param path: directory holding the images and cached artifacts.
    :return: tuple ``(reference_data, model)``.
    """
    data_path = path + '/data.sframe'
    model_path = path + '/savedmodel.model'

    if os.path.isdir(data_path):
        print('reference_data is existed')
        reference_data = tc.load_sframe(data_path)
    else:
        # Load images from the downloaded data and cache them as an SFrame.
        reference_data = tc.image_analysis.load_images(path)
        reference_data = reference_data.add_row_number()
        reference_data.save(data_path)

    if os.path.isdir(model_path):
        print('mod is existed')
        model = tc.load_model(model_path)
    else:
        # Build the similarity model once and persist it for future use.
        model = tc.image_similarity.create(reference_data)
        model.save(model_path)

    return reference_data, model
Ejemplo n.º 27
0
def outlliers_page():
    """Flask view: remove numeric outliers from a user dataset.

    GET renders the outlier-removal form; POST keeps only rows whose
    relative deviation from the target column's mean is below the given
    threshold, saves the filtered data, and redirects to its detail page.

    NOTE(review): the function name 'outlliers_page' looks like a typo,
    but renaming would break the route registration outside this block.
    """
    try:
        data_id = request.args.get('data_id')
        my_data = UserData.query.filter_by(id=data_id).first()
        my_model = TrainedModel()
        form = TrainModelForm(request.form, obj=my_model)
        data_frame = tc.load_sframe(my_data.sname)
        names=data_frame.column_names()
        types=data_frame.column_types()

        if request.method == 'POST':
            # cent: maximum allowed relative deviation from the mean.
            cent = float(request.form['cent'])
            name = str(request.form['name'])
            target = str(request.form['target'])
            mean = data_frame[target].mean()
            # NOTE(review): 'pdiff = diff/mean' below divides by the mean —
            # a zero mean raises ZeroDivisionError into the generic handler.
            rows = []
            for row in data_frame:
                if row[target] is not None:
                    diff = abs(float(row[target]) - mean)
                    pdiff = diff/mean
                    # Keep rows whose value is close enough to the mean.
                    if pdiff < cent:
                        rows.append(row)
                else:
                    # Rows with a missing target value are always kept.
                    rows.append(row)
            sf = tc.SFrame(rows)
            # SFrame built from a list of dicts packs each dict into one
            # column 'X1'; unpack restores the original column layout.
            sf = sf.unpack('X1', column_name_prefix='')
            print(sf)
            fwd_id = save_data(my_data, name, sf)
            flash('Successfully removed outliers!', 'success')
            return redirect(url_for('data.data_details_page', data_id=fwd_id))
        return render_template('pages/data/transforms/outlier.html',
            my_data=my_data,
            data_frame=data_frame,
            names=names,
            types=types,
            form=form)
    except Exception as e:
        # Best-effort error logging; the user is bounced back to the referrer.
        flash('Opps!  Something unexpected happened.  On the brightside, we logged the error and will absolutely look at it and work to correct it, ASAP.', 'error')
        error = ErrorLog()
        error.user_id = current_user.id
        error.error = str(e.__class__)
        error.parameters = request.args
        db.session.add(error)
        db.session.commit()
        return redirect(request.referrer)
Ejemplo n.º 28
0
def extract_imgs_from_sframe(sframe,
                             target_label='mainPlate',
                             buffer=64,
                             draw_center=False,
                             draw_center_line=False,
                             draw_boundings=False,
                             draw_masks=False,
                             draw_frame_num=True,
                             annotations_col='annotations',
                             image_col='image',
                             masks_col='stateMasks'):
    """Render the frames of an SFrame with optional debug overlays.

    :param sframe: path of the SFrame to load.
    :param target_label: annotation label whose centers are tracked.
    :param buffer: pixel buffer passed to the center helpers.
    :param draw_center: draw nearest tracked centers on each frame.
    :param draw_center_line: draw lines between tracked centers.
    :param draw_boundings: draw annotation bounding boxes.
    :param draw_masks: overlay the mask data from ``masks_col``.
    :param draw_frame_num: stamp the frame index onto each image.
    :param annotations_col: column holding per-frame annotations.
    :param image_col: column holding the frame image.
    :param masks_col: column holding mask data (used when draw_masks).
    :return: list of rendered frame pixel arrays, in SFrame order.
    """
    rows = list(tc.load_sframe(sframe))

    frames = []
    centers = {}

    # enumerate replaces the original hand-maintained frame counter.
    for frame_num, row in enumerate(tqdm(rows, desc='Parsing')):
        img = row[image_col].pixel_data
        # Accumulate center positions across frames for the target label.
        append_centers(row[annotations_col],
                       centers,
                       buffer=buffer,
                       target_label=target_label)

        if draw_boundings:
            img = tc.object_detector.util.draw_bounding_boxes(
                get_tc_img(img), row[annotations_col]).pixel_data

        if draw_masks:
            img = draw_mask_data(img, row[masks_col])

        if draw_center:
            img = draw_nearest_centers(img, centers)

        if draw_center_line:
            img = draw_center_lines(img, centers, buffer=buffer)

        if draw_frame_num:
            img = draw_text(img, str(frame_num))

        frames.append(img)

    return frames
Ejemplo n.º 29
0
def get_accuracy(path='.'):
    """Evaluate the image-similarity model by comparing neighbour categories.

    Loads (or builds and saves) the similarity model, queries the reference
    data in chunks of the module-level ``step_length`` with ``k`` neighbours
    per query, and counts a query as correct when the first two returned
    neighbours share a category (4th '/'-separated component of the image
    path), with a special case grouping all 'Faces*' directories together.
    Prints the counts, accuracy and mean neighbour distance (labels in the
    final prints are Chinese: correct count / mistake count / accuracy /
    average distance).

    Relies on module-level globals ``step_length`` and ``k``.
    """
    reference_data = tc.load_sframe(path + '/data.sframe')
    if os.path.isdir(path + '/savedmodel.model'):
        model = tc.load_model(path + '/savedmodel.model')
    else:
        model = tc.image_similarity.create(reference_data)
        model.save(path + '/savedmodel.model')
    correct = 0
    mistake = 0
    index = 0
    distance = 0
    # Query in chunks of step_length to bound per-query size.
    while index < len(reference_data):
        if index + step_length < len(reference_data):
            query_results = model.query(reference_data[index:index +
                                                       step_length],
                                        k=k,
                                        verbose=False)
            index += step_length
        else:
            # Final (possibly shorter) chunk.
            query_results = model.query(reference_data[index:],
                                        k=k,
                                        verbose=False)
            index = len(reference_data)
        # Each queried row contributes exactly k result rows.
        assert len(query_results) % k == 0, 'length error!'
        for i in range(int(len(query_results) / k)):
            # Category of each neighbour = 4th path component of its image.
            category = [
                reference_data[query_results[i * k + j]['reference_label']]
                ['path'].split('/')[3] for j in range(k)
            ]
            # Correct when the top two neighbours agree on category
            # (presumably category[0] is the query image itself — verify).
            if category[0] == category[1] or (('Faces' in category[0]) and
                                              ('Faces' in category[1])):
                correct += 1
            else:
                mistake += 1
            for j in range(k):
                distance += query_results[i * k + j]['distance']

        # NOTE(review): index advances by step_length per iteration, so this
        # progress message only fires when (index + 1) happens to be a
        # multiple of 1000 — likely not what was intended.
        if (index + 1) % 1000 == 0:
            print(str(index + 1) + ' completed!')

    print('正确个数为:' + str(correct))
    print('错误个数为:' + str(mistake))
    print('正确率为:' + str(correct / (correct + mistake)))
    print('平均距离为: ' + str(distance / len(reference_data)))
Ejemplo n.º 30
0
def convert_magic_page():
    """Flask view: replace a 'magic' sentinel value with None in a dataset.

    GET renders the conversion form; POST scans each selected feature
    column and replaces every cell whose string form equals the submitted
    magic value with ``None``, then saves the result as a new dataset and
    redirects to its detail page.
    """
    try:
        data_id = request.args.get('data_id')
        my_data = UserData.query.filter_by(id=data_id).first()
        my_model = TrainedModel()
        form = TrainModelForm(request.form, obj=my_model)
        data_frame = tc.load_sframe(my_data.sname)
        names=data_frame.column_names()
        types=data_frame.column_types()

        if request.method == 'POST':
            # magic: sentinel value (as text) to be nulled out.
            magic = str(request.form['magic'])
            name = str(request.form['name'])
            for feature in request.form.getlist('features'):
                orig_data = data_frame[str(feature)]
                # Round-trip through numpy to iterate plain scalar values.
                norig_data = orig_data.to_numpy()
                new_data = []
                for item in norig_data:
                    # Compare string representations so the same magic text
                    # matches regardless of the column's dtype.
                    if str(item) == magic:
                        new_data.append(None)
                    else:
                        new_data.append(item)
                sa = SArray(new_data)
                data_frame[str(feature)] = sa
            fwd_id = save_data(my_data, name, data_frame)
            flash('Successfully cleared magic values!', 'success')
            return redirect(url_for('data.data_details_page', data_id=fwd_id))

        return render_template('pages/data/transforms/convert_magic.html',
            my_data=my_data,
            data_frame=data_frame,
            names=names,
            types=types,
            form=form)
    except Exception as e:
        # Best-effort error logging; the user is bounced back to the referrer.
        flash('Opps!  Something unexpected happened.  On the brightside, we logged the error and will absolutely look at it and work to correct it, ASAP.', 'error')
        error = ErrorLog()
        error.user_id = current_user.id
        error.error = str(e.__class__)
        error.parameters = request.args
        db.session.add(error)
        db.session.commit()
        return redirect(request.referrer)