Example #1
    def __init__(self, where, queryables=None, encoding=None, scope_level=0):

        where = _validate_where(where)

        self.encoding = encoding
        self.condition = None
        self.filter = None
        self.terms = None
        self._visitor = None

        # capture the environment if needed
        local_dict = DeepChainMap()

        if isinstance(where, Expr):
            local_dict = where.env.scope
            where = where.expr

        elif isinstance(where, (list, tuple)):
            for idx, w in enumerate(where):
                if isinstance(w, Expr):
                    local_dict = w.env.scope
                else:
                    w = _validate_where(w)
                    where[idx] = w
            where = ' & '.join(map('({})'.format, com.flatten(where)))  # noqa

        self.expr = where
        self.env = Scope(scope_level + 1, local_dict=local_dict)

        if queryables is not None and isinstance(self.expr, str):
            self.env.queryables.update(queryables)
            self._visitor = ExprVisitor(self.env, queryables=queryables,
                                        parser='pytables', engine='pytables',
                                        encoding=encoding)
            self.terms = self.parse()
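
A minimal sketch of the list/tuple branch above, assuming only that flatten is importable from pandas.core.common (the where_parts value is made up for illustration):

from pandas.core.common import flatten

where_parts = ["index > 5", ["columns = ['A']", "values < 10"]]
# flatten() collapses the nested list; each clause is then parenthesised and joined
joined = ' & '.join(map('({})'.format, flatten(where_parts)))
print(joined)  # (index > 5) & (columns = ['A']) & (values < 10)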
Example #2
    def __init__(self, lhs, rhs, truediv, *args, **kwargs):
        super(Div, self).__init__('/', lhs, rhs, *args, **kwargs)

        if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type):
            raise TypeError("unsupported operand type(s) for {0}:"
                            " '{1}' and '{2}'".format(self.op,
                                                      lhs.return_type,
                                                      rhs.return_type))

        if truediv or PY3:
            _cast_inplace(com.flatten(self), np.float_)
Example #3
    def __init__(self, lhs, rhs, truediv, *args, **kwargs):
        super().__init__('/', lhs, rhs, *args, **kwargs)

        if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type):
            raise TypeError("unsupported operand type(s) for {0}:"
                            " '{1}' and '{2}'".format(self.op,
                                                      lhs.return_type,
                                                      rhs.return_type))

        # do not upcast float32s to float64 un-necessarily
        acceptable_dtypes = [np.float32, np.float_]
        _cast_inplace(com.flatten(self), acceptable_dtypes, np.float_)
Example #4
def _align(terms):
    """Align a set of terms"""
    try:
        # flatten the parse tree (a nested list, really)
        terms = list(com.flatten(terms))
    except TypeError:
        # can't iterate so it must just be a constant or single variable
        if isinstance(terms.value, pd.core.generic.NDFrame):
            typ = type(terms.value)
            return typ, _zip_axes_from_type(typ, terms.value.axes)
        return np.result_type(terms.type), None

    # if all resolved variables are numeric scalars
    if all(term.isscalar for term in terms):
        return _result_type_many(*(term.value for term in terms)).type, None

    # perform the main alignment
    typ, axes = _align_core(terms)
    return typ, axes
Example #5
    def __init__(self, where, queryables=None, encoding=None, scope_level=0):

        where = _validate_where(where)

        self.encoding = encoding
        self.condition = None
        self.filter = None
        self.terms = None
        self._visitor = None

        # capture the environment if needed
        local_dict = DeepChainMap()

        if isinstance(where, Expr):
            local_dict = where.env.scope
            where = where.expr

        elif isinstance(where, (list, tuple)):
            for idx, w in enumerate(where):
                if isinstance(w, Expr):
                    local_dict = w.env.scope
                else:
                    w = _validate_where(w)
                    where[idx] = w
            where = " & ".join(map("({})".format, com.flatten(where)))  # noqa

        self.expr = where
        self.env = Scope(scope_level + 1, local_dict=local_dict)

        if queryables is not None and isinstance(self.expr, str):
            self.env.queryables.update(queryables)
            self._visitor = ExprVisitor(
                self.env,
                queryables=queryables,
                parser="pytables",
                engine="pytables",
                encoding=encoding,
            )
            self.terms = self.parse()
Example #6
 def build_dataset_with_keras(self, max_vocab_size=50000):
     text = self.get_raw_text()
     words = text_to_word_sequence(
         text, lower=True, filters='\'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
     words = self.remove_stopwords(words)
     max_vocab_size = min(max_vocab_size, len(set(words)))
     # initialize Tokenizer with 'UNK' as the out-of-vocabulary token.
     # Since Keras reserves the 0th index for padding sequences, the index for 'UNK'
     # will be 1st index
     # max_vocab_size + 1 because Keras reserves the 0th index
     tokenizer = Tokenizer(num_words=max_vocab_size + 1,
                           oov_token='UNK',
                           lower=True,
                           filters='\'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
     sentences = self.parse_file_into_sentences()
     tokenizer.fit_on_texts(sentences)
     sequences = tokenizer.texts_to_sequences(sentences)
     # for downstream compatibility
     flatted_sequences = list(flatten(sequences))
     count = tokenizer.word_counts
     # for downstream compatibility
     filtered_count = {}
     dictionary = {}
     for k, v in tokenizer.word_index.items():
         if v <= max_vocab_size:
             if k == 'UNK':
                 filtered_count['UNK'] = 0
                 dictionary['UNK'] = 1
                 continue
             filtered_count[k] = count[k]
             dictionary[k] = v
         else:
             filtered_count['UNK'] += 1
     reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
     # for downstream compatibility
     count_as_tuples = list(
         zip(list(filtered_count.keys()), list(filtered_count.values())))
     assert max_vocab_size == len(count_as_tuples)
     return flatted_sequences, count_as_tuples, dictionary, reverse_dictionary
Example #7
def report_corpus(corpus: list, name: str):
    pos_class_df = DataFrame([
        pos for pos in
        flatten([[part[1] for part in line] for line in corpus])
    ], columns=["POS"])

    grouped_by_pos_df = pos_class_df.groupby(
        "POS"
    ).size(
    ).to_frame(
        name="frequency"
    ).reset_index(
    ).sort_values(
        by="frequency"
    )

    grouped_by_pos_df.to_excel("./reports/{}-class-stats.xlsx".format(
        name
    ))

    grouped_by_pos_df["POS"] = grouped_by_pos_df.apply(
        lambda row: "{} ({})".format(row["POS"],
        row["frequency"]), axis=1
    )

    sns.barplot(
        data=grouped_by_pos_df,
        x="frequency",
        y="POS",
        orient="h",
        saturation=1,
        palette="tab10",
    ).get_figure(
    ).savefig(
        "./reports/{}-corpus-barplot.svg".format(name)
    )
    
    pass
Example #8
    def getPatientData(self, patientInfo, categoryKey, dataKey):
        """function to impute some dummy info for missing fields"""

        dfValue = []

        # return NaN if no data
        if len(patientInfo[categoryKey]) == 0:
            return np.nan

        for iEntry in np.arange(0, len(patientInfo[categoryKey])):
            if dataKey in patientInfo[categoryKey][iEntry]:
                dfValue.append(patientInfo[categoryKey][iEntry][dataKey])
                #if expectedKey == 'drugindication':
                #    return [dfValue]
                #return dfValue
            #else:
            #    return -1

        # return NaN if no results
        if len(dfValue) == 0:
            return np.nan

        return set(list(flatten(dfValue)))
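
The final line above flattens possibly nested entry values and de-duplicates them with set(); a small illustration, assuming flatten comes from pandas.core.common and that dfValue is a toy nested list:

from pandas.core.common import flatten

dfValue = [['aspirin'], ['ibuprofen', 'aspirin'], 'naproxen']
# strings are kept whole rather than split into characters
print(set(flatten(dfValue)))  # {'aspirin', 'ibuprofen', 'naproxen'}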
Example #9
 def return_type(self):
     # clobber types to bool if the op is a boolean operator
     if self.op in (_cmp_ops_syms + _bool_ops_syms):
         return np.bool_
     return _result_type_many(*(term.type for term in com.flatten(self)))
Example #10
]
rule_names = [
    i.split(' ')[0] for i in input[0].replace('departure ', 'd_').replace(
        'arrival ', 'a_').replace('-', ' ').split('\n')
]
my_ticket = list(map(int, input[1].split('\n')[1].split(',')))
tickets = [
    list(map(int, i.split(','))) + [True] for i in input[2].split('\n')[1:]
]
rules = {
    k: [i for i in range(v[0], v[1] + 1)] + [j for j in range(v[2], v[3] + 1)]
    for k, v in zip(rule_names, all_rules)
}

valids = sorted(
    [i for i in range(min(flatten(all_rules)),
                      max(flatten(all_rules)) + 1)])
valid_tickets = list()
for t in tickets:
    for n in t[:-1]:
        if n not in valids:
            t[-1] = False
    if t[-1]:
        valid_tickets.append(t[:-1])
impossible = defaultdict(list)
possible = defaultdict(list)
definitely = defaultdict(int)
for i in range(len(rules)):
    for t in valid_tickets:
        for rule, includes in rules.items():
            if t[i] not in includes:
Example #11
random_fits = []
for i in range(0, len(unemploy_log_diffset)):
  random_fits.append(np.exp((pm.auto_arima(unemploy_log_diffset[i], start_p=1, start_q=1, max_p=3, max_q=3, m=12,
  n_jobs=-1,
  error_action='ignore',
  suppress_warnings=True, #memory issues cause warnings 
  stepwise=False, random=True, random_state=42, n_fits=10)).predict(n_periods=13)))
  print (i)

#add back to the data to prepare for prophet 
from pandas.core.common import flatten

future_regressor_sets = {}
for i in set(future_dates):
  for j in range(0, len(next_13)):
    future_regressor_sets[i] = list(flatten(([unemploy_log_diffset[i]['Unemployment_rate']],random_fits[j])))

test=future_dates

#put the data in a format that Prophet will understand -- ds and unemployment rates, including future values
for key in test:
  test[key]['Unemployment_rate']=pd.Series(future_regressor_sets[key])
  
#for key in test:
 # test[key]['Unemployment_rate']=pd.Series(future_regressor_sets[key])
#test.values()

#use prophet to get those predictions
test_f={}
for i in test:
  test_f[i] = prophet_models[i].predict(test[i])
Example #12
    def __init__(self, lhs, rhs, truediv, *args, **kwargs):
        super(Div, self).__init__('/', lhs, rhs, *args, **kwargs)

        if truediv or PY3:
            _cast_inplace(com.flatten(self), np.float_)
Example #13
    print('Fix input file')
    print(zedd)
    print('original input')

    xs, cv, kf = run_kf(data=zedd,
                        dim_of_measurements=dim_of_measurements,
                        measured_var=(measured_var),
                        covar=(covar),
                        process_model=(process_model),
                        white_noise_var=white_noise_var,
                        dt=dt,
                        sensor_covar=(sensor_covar),
                        measurement_function=(measurement_function))

    x, p = run_smoother(kf, xs, cv)

    final_x = []
    for i in x:
        final_x.append(list(flatten(i)))

    print(final_x)
    print('type: ', type(final_x))
    print('FINAL_X', final_x[0])
    print(final_x[0][0])
    print(type(p))
    # print('==============+P============', p[0])

    process_files.process_output(final_x, p, output_loc)

    #Kalman.visualise(x, p, zedd, real)
Example #14
                         header=None)
id_map = dict(zip(id_map_csv.iloc[:, 1], id_map_csv.iloc[:, 0]))

del id_map_csv
gc.collect()
## Getting ID Mapping Files

## Get Training Embeddings IDs
train_feature_dict_dir = "/data/recsys2020/history_nn/TrainChunk*"
train_chunks_dirs = list(sorted(glob.glob(train_feature_dict_dir)))

for i, file in enumerate(train_chunks_dirs):
    print(i, file)
    with open(file, 'rb') as f:
        chunk = joblib.load(f)
    setid = set(flatten(chunk['tweet_ids']))
    print('setid', len(setid))
    setengagenum = set(np.unique(chunk['engagement_histories']))
    print('setengagenum', len(setengagenum))
    setengage = {id_map[k] for k in setengagenum if k in id_map}
    print('setengage', len(setengage))
    setid.update(setengage)
    print('setid', len(setid))

    ids_chunk = np.array(list(setid))
    with open("/data/recsys2020/history_nn/TrainEmbID" + str(i), "wb") as f2:
        joblib.dump(ids_chunk, f2)

    del ids_chunk
    del setid
    del setengagenum
Example #15
 def names(self):
     """Get the names in an expression"""
     if is_term(self.terms):
         return frozenset([self.terms.name])
     return frozenset(term.name for term in com.flatten(self.terms))
Example #16
        ele2 = ele1.replace(in_[1], out_[1])
        ele3 = ele2.replace(in_[2], out_[2])
        ele4 = ele3.replace(in_[3], out_[3])
        ele5 = ele4.split('::')
        lis_.append(ele5)
    return lis_


####################################

## read in Selenium scrape data in json format

with open('groupOverview_l.json') as f:
    data = json.loads("[" + f.read().replace("][", "],\n[") + "]")
    #print(data[0][2])
    group = pd.Series(flatten([data[i][0] for i in range(len(data))]))
    OverviewText = pd.Series(flatten([data[i][1] for i in range(len(data))]))
    BGA = pd.Series([data[i][2] for i in range(len(data))])

    TypeNames_ = []
    Method_ = []
    Location_ = []
    OverallScore_ = []
    for ind in range(len(data)):
        TML = multReplaceStr(data[ind][3])
        TypeNames_.append(
            [x.strip(' ') for x in (TML[i][0] for i in range(len(TML)))])
        TypeNames = pd.Series(TypeNames_)
        Method_.append(
            [
                x.upper().strip(' ').replace('JIGS', 'JIG').replace(
Example #17
 def return_type(self):
     # clobber types to bool if the op is a boolean operator
     if self.op in (_cmp_ops_syms + _bool_ops_syms):
         return np.bool_
     return result_type_many(*(term.type for term in com.flatten(self)))
Example #18
def tripadvisor(urll):
    html = requests.get(urll).text
    soup = BeautifulSoup(html, "html.parser")
    data = soup.findAll('div', attrs={'class': '_6sUF3jUd'})

    hotel_name = []

    hotel_link = []

    for div in data:
        links = div.findAll('h2')
        for a in links:
            hotel_name.append(a.text)

    for div in data:
        links = div.findAll('a', attrs={'class': '_1QKQOve4'})
        for a in links:
            hotel_link.append("https://www.tripadvisor.fr" + a['href'])

    hotel_data = pd.DataFrame(list(zip(hotel_name, hotel_link)),
                              columns=['Hotel Name', 'Hotel Link'])
    #url = 'https://www.tripadvisor.fr/Attraction_Review-g196629-d12447489-Reviews-Canyoning_Saint_Lary-Saint_Lary_Soulan_Hautes_Pyrenees_Occitanie.html'

    #url = 'https://www.tripadvisor.fr/Attraction_Review-g1841271-d13280719-Reviews-Escape_Dimension_La_Croisee_des_Mondes-Saleilles_Perpignan_Pyrenees_Orientales_.html'
    mail = []
    phone = []
    website = []
    address = []
    for i in hotel_data['Hotel Link']:
        html_data = requests.get(i).text
        data = re.search(r'window\.__WEB_CONTEXT__=(\{.*?\});',
                         html_data).group(1)
        data = json.loads(data.replace('pageManifest', '"pageManifest"'))
        soup = BeautifulSoup(html_data, "html.parser")

        def get_emails(val):
            if isinstance(val, dict):
                for k, v in val.items():
                    if k == 'email':
                        if v:
                            yield v
                    else:
                        yield from get_emails(v)
            elif isinstance(val, list):
                for v in val:
                    yield from get_emails(v)

        def get_phones(val):
            if isinstance(val, dict):
                for k, v in val.items():
                    if k == 'phone':
                        if v:
                            yield v
                    else:
                        yield from get_phones(v)
            elif isinstance(val, list):
                for v in val:
                    yield from get_phones(v)

        def get_websites(val):
            if isinstance(val, dict):
                for k, v in val.items():
                    if k == 'website':
                        if v:
                            yield v
                    else:
                        yield from get_websites(v)
            elif isinstance(val, list):
                for v in val:
                    yield from get_websites(v)

        try:

            for email in get_emails(data):
                email = base64.b64decode(email).decode('utf-8')
                email = re.search(r'mailto:(.*)_', email).group(1)
                mail1 = []
                mail1.append(email)
            mail.append(mail1[0])
        except:
            for email in get_emails(data):
                mail1 = []
                mail1.append(email)
            mail.append(mail1[0])

        #print(mail[0])

        try:

            for email in get_phones(data):
                email = base64.b64decode(email).decode('utf-8')
                phone1 = []
                phone1.append(email)
            phone.append(phone1[0])
        except:
            for email in get_phones(data):
                phone1 = []
                phone1.append(email)
            phone.append(phone1[0])

        #print(phone[0])

        try:

            for email in get_websites(data):
                email = base64.b64decode(email).decode('utf-8')
                website1 = []
                website1.append(email)
            website.append(website1[0])
        except:
            for email in get_websites(data):
                website1 = []
                website1.append(email)
            website.append(website1[0])

    #print(website[0])

        try:
            divs = soup.findAll("div", attrs={'class': 'LjCWTZdN'})
            div = divs[0]
            for span in div:
                address1 = []
                address1.append(span.text)
            address.append(address1)
            # try:
            #      for span in soup.findAll("div",attrs={'class':'_2hDw2pmg'}):
            #          address1 = []
            #          address1.append(span.text)
            #      address.append(address1[0])
            # except:
            #      address.append('not available')
        except:
            address.append('not available')

    address = list(flatten(address))
    hotel_data['Hotel Website'] = website
    hotel_data['Hotel Phone'] = phone
    hotel_data['Hotel Email'] = mail
    hotel_data['Hotel Address'] = address
    return hotel_data
Example #19
def melt(
    frame: "DataFrame",
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
    ignore_index: bool = True,
) -> "DataFrame":
    # If multiindex, gather names of columns on all level for checking presence
    # of `id_vars` and `value_vars`
    if isinstance(frame.columns, MultiIndex):
        cols = [x for c in frame.columns for x in c]
    else:
        cols = list(frame.columns)

    if value_name in frame.columns:
        warnings.warn(
            "This dataframe has a column name that matches the 'value_name' column "
            "name of the resulting Dataframe. "
            "In the future this will raise an error, please set the 'value_name' "
            "parameter of DataFrame.melt to a unique name.",
            FutureWarning,
            stacklevel=3,
        )

    if id_vars is not None:
        if not is_list_like(id_vars):
            id_vars = [id_vars]
        elif isinstance(frame.columns,
                        MultiIndex) and not isinstance(id_vars, list):
            raise ValueError(
                "id_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            # Check that `id_vars` are in frame
            id_vars = list(id_vars)
            missing = Index(com.flatten(id_vars)).difference(cols)
            if not missing.empty:
                raise KeyError("The following 'id_vars' are not present "
                               f"in the DataFrame: {list(missing)}")
    else:
        id_vars = []

    if value_vars is not None:
        if not is_list_like(value_vars):
            value_vars = [value_vars]
        elif isinstance(frame.columns,
                        MultiIndex) and not isinstance(value_vars, list):
            raise ValueError(
                "value_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            value_vars = list(value_vars)
            # Check that `value_vars` are in frame
            missing = Index(com.flatten(value_vars)).difference(cols)
            if not missing.empty:
                raise KeyError("The following 'value_vars' are not present in "
                               f"the DataFrame: {list(missing)}")
        if col_level is not None:
            idx = frame.columns.get_level_values(col_level).get_indexer(
                id_vars + value_vars)
        else:
            idx = frame.columns.get_indexer(id_vars + value_vars)
        frame = frame.iloc[:, idx]
    else:
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy
        frame.columns = frame.columns.get_level_values(col_level)

    if var_name is None:
        if isinstance(frame.columns, MultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                var_name = [
                    f"variable_{i}" for i in range(len(frame.columns.names))
                ]
        else:
            var_name = [
                frame.columns.name
                if frame.columns.name is not None else "variable"
            ]
    if isinstance(var_name, str):
        var_name = [var_name]

    N, K = frame.shape
    K -= len(id_vars)

    mdata = {}
    for col in id_vars:
        id_data = frame.pop(col)
        if is_extension_array_dtype(id_data):
            id_data = cast("Series", concat([id_data] * K, ignore_index=True))
        else:
            id_data = np.tile(id_data._values, K)
        mdata[col] = id_data

    mcolumns = id_vars + var_name + [value_name]

    mdata[value_name] = frame._values.ravel("F")
    for i, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index
        mdata[col] = np.asanyarray(
            frame.columns._get_level_values(i)).repeat(N)

    result = frame._constructor(mdata, columns=mcolumns)

    if not ignore_index:
        result.index = tile_compat(frame.index, K)

    return result
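
In the id_vars / value_vars validation above, com.flatten collapses a list of column tuples (the MultiIndex case) before the membership check; a minimal illustration with made-up labels:

from pandas import Index
from pandas.core.common import flatten

id_vars = [('A', 'x'), ('B', 'y')]        # MultiIndex column labels
cols = ['A', 'x', 'B', 'y', 'C', 'z']     # all level values, gathered as above
missing = Index(list(flatten(id_vars))).difference(cols)
print(list(missing))  # [] -- every requested id_var is present in the frame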
Example #20
            currentRsi = float("{:.2f}".format(stock.rsi[-1]))

            if currentRsi > 75:
                data.append(str(currentRsi) + " 🔥")
            elif currentRsi < 35:
                data.append(str(currentRsi) + " 🧊")
            else:
                data.append(currentRsi)

            chartLink = "https://finance.yahoo.com/quote/" + ticker + "/chart?p=" + ticker

            data.append(chartLink)

            allData.append(data)

            # Shows chart only if current RSI is greater than or less than 70 or 30 respectively
            if currentRsi < 30 or currentRsi > 70:

                stock.graph(MAarr)

        except Exception as e:
            print('Error: ', str(e))

    print(
        tabulate(allData,
                 headers=flatten([
                     'Stock', 'Price', [str(x) + " MA" for x in MAarr], "RSI",
                     "chart"
                 ])))
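
The headers argument above mixes plain strings with a nested list comprehension, which flatten turns into a single flat header row; a short sketch assuming MAarr = [50, 200]:

from pandas.core.common import flatten

MAarr = [50, 200]
headers = list(flatten(['Stock', 'Price', [str(x) + ' MA' for x in MAarr], 'RSI', 'chart']))
print(headers)  # ['Stock', 'Price', '50 MA', '200 MA', 'RSI', 'chart']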
Example #21
    def _expand_colspan_rowspan(self, rows, fill_rowspan=True):
        """Given a list of rows, return a list of rows that properly handle
           colspan/rowspan

        Discussion on behavior of fill_rowspan in #17073

        Parameters
        ----------
        rows : list of rows, each of which is a list of elements in that row

        fill_rowspan : boolean
            Should a rowspan fill every item in the rowspan (True) or only the
            bottommost element (False)? Default is True.

        Returns
        -------
        res : list of rows, each of which is a list of elements in that row,
            respecting colspan/rowspan
        """

        res = []
        saved_span = []
        for row in rows:
            extracted_row = self._extract_td(row)
            cols_text = [
                _remove_whitespace(self._text_getter(col))
                for col in extracted_row
            ]
            col_colspans = [
                int(col.get('colspan', 1)) for col in extracted_row
            ]
            col_rowspans = [
                int(col.get('rowspan', 1)) for col in extracted_row
            ]
            # expand cols using col_colspans
            # maybe this can be done with a list comprehension, dunno
            cols = list(
                zip(
                    list(
                        flatten(
                            lmap(lambda text_nc: [text_nc[0]] * text_nc[1],
                                 list(zip(cols_text, col_colspans))))),
                    list(
                        flatten(
                            lmap(lambda nc_nr: [nc_nr[1]] * nc_nr[0],
                                 list(zip(col_colspans, col_rowspans)))))))
            # cols is now a list of (text, number of rows)
            # now insert any previous rowspans
            for (col, (text, nr)) in saved_span:
                cols.insert(col, (text, nr))

            # save next saved_span
            def advance_item_to_next_row(item):
                (col, (text, nr)) = item
                if nr == 1:
                    return None
                else:
                    # only keep the text around if fill_rowspan is set
                    return (col, (text if fill_rowspan else '', nr - 1))

            saved_span = lfilter(
                lambda i: i is not None,
                lmap(advance_item_to_next_row, list(enumerate(cols))))
            cols = [text for (text, nr) in cols]
            # generate cols with text only
            if any([col != '' for col in cols]):
                res.append(cols)
        return res
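
A small sketch of the colspan expansion performed above, written in plain Python rather than with the lmap helper (the cell values are illustrative):

from pandas.core.common import flatten

cols_text = ['A', 'B']
col_colspans = [2, 1]
# repeat each cell's text colspan times, then flatten into a single expanded row
expanded = list(flatten([[text] * n for text, n in zip(cols_text, col_colspans)]))
print(expanded)  # ['A', 'A', 'B']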
Example #22
    for k in max_types
}
print('normed all maximum loss types (stoch+cont): \n', normed_max_types, '\n')

print('all maximum loss types (stoch): \n', max_types_no_cont, '\n')
normed_max_types_no_cont = {
    k: np.round(max_types_no_cont[k] / sum(max_types_no_cont.values()), 2)
    for k in max_types_no_cont
}
print('normed all maximum loss types (stoch): \n', normed_max_types_no_cont,
      '\n')

# In[9]:

bins = np.logspace(-9, 1, 30)
plt.hist(list(flatten(opening_typ_dict['stoch_opening'])),
         bins=bins,
         histtype='step',
         label='stoch op')
plt.hist(list(flatten(opening_typ_dict['cont_opening'])),
         bins=bins,
         histtype='step',
         label='cont op')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('opening angle of each interaction in degree')
plt.ylabel('counts')
plt.legend()
events_s = len([i for i in opening_typ_dict['stoch_opening'] if len(i) > 0])
events_c = len([i for i in opening_typ_dict['cont_opening'] if len(i) > 0])
plt.title('stoch events: {}, cont events: {}'.format(events_s, events_c))
Example #23
 def operand_types(self):
     return frozenset(term.type for term in com.flatten(self))
Example #24
for root, dirs, files in os.walk(all_paths_dir):
    for file in files:
        if file.endswith(".pkl"):
            all_paths_filenames.append(os.path.join(root, file))

img_paths_train = []
mask_paths_train = []
img_labels_train = []
for train_path in all_paths_filenames[:-1]:
    with open(train_path, 'rb') as alp:
        all_paths_splits = pickle.load(alp)
        img_paths_train.append(all_paths_splits['image paths'])
        mask_paths_train.append(all_paths_splits['mask paths'])
        img_labels_train.append(all_paths_splits['image labels'])

img_paths_train = list(flatten(img_paths_train))
mask_paths_train = list(flatten(mask_paths_train))
img_labels_train = list(flatten(img_labels_train))

with open(all_paths_filenames[-1], 'rb') as alp:
    all_paths_splits = pickle.load(alp)
    img_paths_valid = all_paths_splits['image paths']
    mask_paths_valid = all_paths_splits['mask paths']
    img_labels_valid = all_paths_splits['image labels']

print("Total training samples : ", len(img_paths_train))
print("Total validation samples : ", len(img_paths_valid))


class Dataset(torch.utils.data.Dataset):
    'Characterizes a dataset for PyTorch'
Example #25
def ComputeVals(backWindowVal, frontWindowVal, dfAOI, df, polys, patient,
                mode):
    #Define the back window and front window (not strictly necessary, but it helps for clarity)
    backWindow = backWindowVal
    frontWindow = frontWindowVal
    #Create a marker to split into consecutive AOI groups
    dfAOI['marker'] = (dfAOI['InfoUnit'] != dfAOI['InfoUnit'].shift()).cumsum()
    #Add the first and last of every group into our df_master
    df_master = dfAOI.index.to_series().groupby(dfAOI['marker']).agg(
        ['first', 'last']).reset_index()
    #Add a new column with the patient name on our df_master
    df_master['PATIENT'] = patient
    #Find the  RecordingTimestamp for every first and last index in our df_master and store it in their respective df's
    df_firstIndexTime = df.loc[df_master['first'], ['RecordingTimestamp']]
    df_lastIndexTime = df.loc[df_master['last'], ['RecordingTimestamp']]
    #Add the RecordingTimestamps to our df_master
    df_master = df_master.assign(
        recordingTimeStampStart=df_firstIndexTime.values,
        recordingTimeStampEnd=df_lastIndexTime.values)
    #Assign the target for looking back and front by subtracting the back window and adding the front window, then assigning it to our df_master
    df_master = df_master.assign(
        timeTargetBack=df_master['recordingTimeStampStart'].values -
        backWindow,
        timeTargetFront=df_master['recordingTimeStampEnd'].values +
        frontWindow)
    #Create a copy of the index in the main dataframe to avoid losing the indices later
    df['copy_index'] = df.index
    #Use merge_asof to find, for each target time stored in df_master, the nearest RecordingTimestamp in the main dataframe (looking backward and forward);
    #the results are stored in new dataframes containing only the rows of the main dataframe closest to our targets. Since we kept a copy of the index,
    #we still know each row's location in the main df after the merge. It is essentially a
    #left join, but matching on the nearest value instead of equality.
    df_backward = pd.merge_asof(df_master,
                                df,
                                left_on='timeTargetBack',
                                right_on='RecordingTimestamp',
                                direction='backward')
    df_forward = pd.merge_asof(df_master,
                               df,
                               left_on='timeTargetFront',
                               right_on='RecordingTimestamp',
                               direction='forward')
    #Assign the copy of indices which are the correct ones on our df_master
    df_master = df_master.assign(
        backWindowIndex=df_backward['copy_index'].values)
    df_master = df_master.assign(
        frontWindowIndex=df_forward['copy_index'].values)
    #Assign the info unit values for every index in the df_master, this could have been done earlier
    df_master = df_master.assign(InfoUnits=df.loc[df_master['first'],
                                                  'InfoUnit'].values)
    #Create a list to store the smaller dataframes from the back to front windows
    df_list = []
    #Zip the values to create a point in the main dataframe, will be useful for later when finding if it falls on a polygon
    df['point'] = list(
        zip(df['FixationPointX..MCSpx.'], df['FixationPointY..MCSpx.']))
    #Append all the smaller dataframes into the list
    for index, row in df_master.iterrows():
        df_list.append(df.loc[row['backWindowIndex']:row['frontWindowIndex']].
                       drop_duplicates('FixationIndex'))
    #Iterate through the df_list and create a new dataframe of true and false values based on which polygon every row lands on, append this to a list of hits.
    hitsList = []
    for x in df_list:
        pointList = list(map(myPoint, (x['point'].values)))
        _pnts = pointList
        pnts = gp.GeoDataFrame(geometry=_pnts)
        hitsList.append(
            pnts.assign(
                **{key: pnts.within(geom)
                   for key, geom in polys.items()}))
    resList = []
    #We iterate through the true false dataframes and we just keep the truth values as a list
    for data in hitsList:
        res = list(
            flatten(
                pd.DataFrame(data.columns.where(
                    data == True).tolist()).values.tolist()))
        ans = [x for x in res if not isinstance(x, float)]
        noDuplicate = pd.Series(ans).drop_duplicates()
        resList.append(list(noDuplicate))
    #Store the hits in df_master
    hitName = 'HITS' + '_' + str(backWindow)
    df_master[hitName] = resList
    hitToCSV = df_master[['PATIENT', 'InfoUnits', hitName]]
    #If we just call for hits just return the hits dataframe
    if (mode == 'hits'):
        return hitToCSV
    #Else we call full and now we calculate the latencies
    elif (mode == 'full'):
        df_latency = df_master
        #Drop undefined AOI's
        # df_latency.drop(df_latency.loc[df_latency['InfoUnits']=='KITCHEN'].index, inplace=True)
        # df_latency.drop(df_latency.loc[df_latency['InfoUnits']=='EXTERIOR'].index, inplace=True)
        # df_latency.drop(df_latency.loc[df_latency['InfoUnits']=='CUPBOARD'].index, inplace=True)
        #Call the computeLatency function for back and front
        latencyValuesBack = computeLatency(df, polys, df_latency, 'back')
        latencyValuesFront = computeLatency(df, polys, df_latency, 'front')
        df_master['BackLatency'] = latencyValuesBack[0]
        df_master['BackIndex'] = latencyValuesBack[1]
        df_master['FrontLatency'] = latencyValuesFront[0]
        df_master['FrontIndex'] = latencyValuesBack[1]
        latencyToCSV = df_master[[
            'PATIENT', 'InfoUnits', 'BackLatency', 'FrontLatency'
        ]]
        patientResult = pd.merge(latencyToCSV,
                                 hitToCSV,
                                 right_index=True,
                                 left_index=True)
        patientResult.reset_index()
        resultsToCSV = patientResult.drop(['PATIENT_y', 'InfoUnits_y'], axis=1)
        resultsToCSV = resultsToCSV.rename(columns={
            'PATIENT_x': 'Patient',
            'InfoUnits_x': 'InfoUnits'
        })
        return resultsToCSV
Example #26
 def operand_types(self):
     return frozenset(term.type for term in com.flatten(self))
Example #27
def score_web_text(tokens):
    # only use the text scraped from the website

    # table of word pools for the different company types, also divided by SIC codes; see DATASETS for the differences and references
    table_of_pool_words = [
        pool_words_R_ext, pool_words_I_ext, pool_words_DS_ext,
        pool_words_OS_ext, pool_words_RS_ext, pool_words_DA_ext,
        pool_words_MSP_ext, pool_words_CC_ext, pool_words_MS_ext,
        pool_words_VD_ext, pool_words_TC_ext
    ]

    # transform the word pools into a dataframe
    words_dataframe = pd.DataFrame(table_of_pool_words)

    pool_set = list(flatten(table_of_pool_words))

    unique_words = set()
    for x in pool_set:
        unique_words.add(x)

    gen_weights = np.zeros(len(unique_words))

    counter = 0
    for x in unique_words:
        id = np.where(words_dataframe == x)[0]
        gen_weights[counter] = len(id)
        counter += 1

    list_words = list(unique_words)
    points = np.asarray(list_words)

    table_of_dataframes = []
    # loop over each pool words
    for iweight in range(len(table_of_pool_words)):
        id_pool = return_indices_of_a(list_words, table_of_pool_words[iweight])
        # here I created the pool and the weight for each word for each company type
        weight = 1. / gen_weights[id_pool]
        new_pool = points[id_pool]

        pool = pd.DataFrame([new_pool, weight])
        table_of_dataframes.append(pool)

    # actual sum of all the weights in each pool-words
    value_total = np.zeros(len(table_of_pool_words))

    for itotal in range(len(table_of_pool_words)):
        dataf = table_of_dataframes[itotal]
        value_total[itotal] = np.sum(dataf.iloc[1])

    n_str = len(tokens)

    flag_check = []

    if n_str == 0:
        print('!raised awareness! zero element in the analyzed text')
        flags_checks = np.zeros(len(table_of_pool_words))
    elif n_str > 0:

        if n_str == 1:
            print('!raised awareness! Only 1 element in the analyzed text')

        for ipool in range(len(table_of_pool_words)):
            dataf = table_of_dataframes[ipool]
            id_0 = return_indices_of_a(dataf.iloc[0], tokens)
            values = np.sum(dataf.iloc[1][id_0])
            flag_check.append(values / value_total[ipool])

        flags_checks = 100 * np.asarray(flag_check)

    return flags_checks
Example #28

# I will run the distance_coord function for every element (DataFrame) in the gdf_list.

total_distance=[]

for item in gdf_list:
    total_distance.append(distance_coord(item))


# In[21]:


# I will flatten the total_distance list of lists to a single flat list.

total_distance = list(flatten(total_distance))


# In[22]:


# I will create a function to compute the tmin, tmax and timedelta for every user.

def compute_tmin_tmax_timedelta(df):
    'This function computes the min and max timestamps and the timedelta for every user'
    tmin = df['datetime'].min()
    tmax = df['datetime'].max()
    timedelta = tmax-tmin
    return tmin, tmax, timedelta
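
As in the snippet above, flatten returns a lazy generator, so it has to be wrapped in list() before being reused; a minimal sketch with toy per-trajectory distance lists:

from pandas.core.common import flatten

per_user_distances = [[1.2, 3.4], [0.5], [2.0, 2.5]]
total_distance = list(flatten(per_user_distances))
print(total_distance)  # [1.2, 3.4, 0.5, 2.0, 2.5]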
    
Example #29
    if args.overleaf:
        clear_directory(document_path)
        fetch_overleaf(args.overleaf, document_path)

    temp_path = create_temporary_copy(document_path)

    conf_source_path = os.path.join(document_path, "variables.json")
    with open(conf_source_path) as f:
        conf_source = json.load(f)

    # DataFrame initialisation
    cols = conf_source["booleans"] + \
        list(conf_source["numbers"].keys()) + \
        list(conf_source["enums"].keys()) + \
        list(flatten(conf_source["choices"])) + \
        ["nbPages", "space"]
    df = pd.DataFrame(columns=cols)

    file_path = os.path.join(temp_path, filename)
    inject_space_indicator(file_path)
    # LaTeX bbl pregeneration
    generate_bbl(file_path)

    # ----------------------------------------
    # PDF generation
    # ----------------------------------------
    if args.config:
        row = generate_pdf(json.loads(args.config), filename, temp_path)
        pdf_name = filename + ".pdf"
        shutil.copyfile(os.path.join(temp_path, pdf_name),
Example #30
def emo_analysis(input_text):
    tokenizer = BertTokenizer.from_pretrained(
        "monologg/bert-base-cased-goemotions-original")
    model = BertForMultiLabelClassification.from_pretrained(
        "monologg/bert-base-cased-goemotions-original")

    goemotions = MultiLabelPipeline(model=model,
                                    tokenizer=tokenizer,
                                    threshold=0.3)

    # check the results
    ##print(goemotions(texts))
    ########## here we apply the original input essay: input_text
    re_text = input_text.split(".")

    # data preprocessing
    def cleaning(datas):

        fin_datas = []

        for data in datas:
            # replace non-alphabetic characters with spaces
            only_english = re.sub('[^a-zA-Z]', ' ', data)

            # append the cleaned text to the list
            fin_datas.append(only_english)

        return fin_datas

    texts = cleaning(re_text)

    # extract only the detected emotion labels
    emo_re = goemotions(texts)

    emo_all = []
    for list_val in range(0, len(emo_re)):
        ##print(emo_re[list_val]['labels'],emo_re[list_val]['scores'])
        #emo_all.append((emo_re[list_val]['labels'],emo_re[list_val]['scores']))  # extract only the keys and values and store them as a list
        #emo_all.append(emo_re[list_val]['scores'])
        emo_all.append((emo_re[list_val]['labels']))

    from pandas.core.common import flatten  # flatten the nested list
    flat_list = list(flatten(emo_all))
    # excluding neutral, extract every emotion found in the input sentences and count which emotions appear

    unique = []
    for r in flat_list:
        if r == 'neutral':
            pass
        else:
            unique.append(r)

    # count frequencies and sort by count (most common first)

    from collections import Counter

    count = Counter(unique)
    words = dict(count.most_common())

    ######  word cloud implementation  start  #####
    # # total number of analyzable emotions - the original BERT GoEmotions model extracts 28 emotions
    # total_num_emotion_analyzed = 28

    # ########## wordCloud settings ########
    # from wordcloud import WordCloud

    # import matplotlib.pyplot as plt

    # import nltk
    # from nltk.corpus import stopwords
    # %matplotlib inline

    # import matplotlib
    # from IPython.display import set_matplotlib_formats
    # matplotlib.rc('font',family = 'Malgun Gothic')

    # set_matplotlib_formats('retina')

    # matplotlib.rc('axes',unicode_minus = False)

    # # render the key emotions of the sentences as a word cloud (the largest words are the most frequent results)

    # wordcloud = WordCloud(background_color='white',
    #                     colormap = "Accent_r",
    #                     width=1500, height=1000).generate_from_frequencies(words)

    # plt.imshow(wordcloud)
    # plt.axis('off')
    # plt.show()
    ######  word cloud implementation  end  #####

    # derive the key emotion values expressed in the essay
    return words
Example #31
    def __init__(self, lhs, rhs, truediv=True, *args, **kwargs):
        super(Div, self).__init__('/', lhs, rhs, *args, **kwargs)

        if truediv or PY3:
            _cast_inplace(com.flatten(self), np.float_)
Example #32
# Split products into terms: Tokenize.
products['products_mod'] = products['products_mod'].str.split()

# # Merge the synonyms per se
# departments_synonyms = departments_synonyms.groupby('department')['synonyms'].apply(list)
# departments_synonyms = pd.merge(departments, departments_synonyms, on="department", how='outer').fillna('')

# Merge the department and aisle names into the dataframe. 
products = pd.merge(products, departments, on="department_id", how='outer')
products = pd.merge(products, aisles, on="aisle_id", how='outer')

# https://stackoverflow.com/a/43898233/3780957
# https://stackoverflow.com/a/57225427/3780957
# Remove synonyms here in the list
products['products_mod'] = products[['products_mod', 'aisle', 'department']].values.tolist()
products['products_mod'] = products['products_mod'].apply(lambda x:list(flatten(x)))

# %%
# Stemming and lemmatisation of the product name
# https://stackoverflow.com/a/24663617/3780957
# https://stackoverflow.com/a/25082458/3780957
# https://en.wikipedia.org/wiki/Lemmatisation

lemma = nltk.wordnet.WordNetLemmatizer()
sno = nltk.stem.SnowballStemmer('english')
products['products_lemma'] = products['products_mod'].apply(lambda row:[lemma.lemmatize(item) for item in row])
products['products_lemma'] = products['products_lemma'].apply(lambda row:[sno.stem(item) for item in row])

# %%
## EDA ----
Example #33
def melt(
    frame: DataFrame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
) -> DataFrame:
    # TODO: what about the existing index?
    # If multiindex, gather names of columns on all level for checking presence
    # of `id_vars` and `value_vars`
    if isinstance(frame.columns, ABCMultiIndex):
        cols = [x for c in frame.columns for x in c]
    else:
        cols = list(frame.columns)

    if id_vars is not None:
        if not is_list_like(id_vars):
            id_vars = [id_vars]
        elif isinstance(frame.columns,
                        ABCMultiIndex) and not isinstance(id_vars, list):
            raise ValueError(
                "id_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            # Check that `id_vars` are in frame
            id_vars = list(id_vars)
            missing = Index(com.flatten(id_vars)).difference(cols)
            if not missing.empty:
                raise KeyError("The following 'id_vars' are not present"
                               " in the DataFrame: {missing}"
                               "".format(missing=list(missing)))
    else:
        id_vars = []

    if value_vars is not None:
        if not is_list_like(value_vars):
            value_vars = [value_vars]
        elif isinstance(frame.columns,
                        ABCMultiIndex) and not isinstance(value_vars, list):
            raise ValueError(
                "value_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            value_vars = list(value_vars)
            # Check that `value_vars` are in frame
            missing = Index(com.flatten(value_vars)).difference(cols)
            if not missing.empty:
                raise KeyError("The following 'value_vars' are not present in"
                               " the DataFrame: {missing}"
                               "".format(missing=list(missing)))
        frame = frame.loc[:, id_vars + value_vars]
    else:
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy
        frame.columns = frame.columns.get_level_values(col_level)

    if var_name is None:
        if isinstance(frame.columns, ABCMultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                var_name = [
                    "variable_{i}".format(i=i)
                    for i in range(len(frame.columns.names))
                ]
        else:
            var_name = [
                frame.columns.name
                if frame.columns.name is not None else "variable"
            ]
    if isinstance(var_name, str):
        var_name = [var_name]

    N, K = frame.shape
    K -= len(id_vars)

    mdata = {}
    for col in id_vars:
        id_data = frame.pop(col)
        if is_extension_array_dtype(id_data):
            id_data = concat([id_data] * K, ignore_index=True)
        else:
            id_data = np.tile(id_data.values, K)
        mdata[col] = id_data

    mcolumns = id_vars + var_name + [value_name]

    mdata[value_name] = frame.values.ravel("F")
    for i, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index
        mdata[col] = np.asanyarray(
            frame.columns._get_level_values(i)).repeat(N)

    return frame._constructor(mdata, columns=mcolumns)
Example #34
            intro_sel = intro[id_sic[element]]

            if intro_sel:
                pool = Pool(os.cpu_count())
                # cleaning process of the introduction

                pool.apply_async(worker, (
                    intro_sel,
                    all_token,
                ))

        pool.close()
        pool.join()
        all_tks.append(all_token)

        token_dataframe = pd.DataFrame(list(flatten(all_tks)))
        # here you can create the dataframe for N-Grams
        #bigrams_dataframe = pd.DataFrame(bi_grams)
        print(type(token_dataframe), len(token_dataframe))
        counts_words = []

        for toks in range(len(token_dataframe.iloc[0])):
            value_token = token_dataframe[toks].value_counts()

            # count each singular words
            counts_words.append(value_token)

        Counted_words = pd.DataFrame(counts_words)

        Counted_words.fillna(value=0, inplace=True)
Example #35
 def return_type(self):
     # clobber types to bool if the op is a boolean operator
     if self.op in (CMP_OPS_SYMS + BOOL_OPS_SYMS):
         return np.bool_
     return result_type_many(*(term.type for term in com.flatten(self)))
Example #36
meals_ds['Price Range'] = meals_ds['Price Range'].replace(
    to_replace=dict_price)  # replace all values by dictionary

print('Fill empty values of median by town')
for city in city_set:
    median_ = meals_ds[meals_ds['City'] == city]['Price Range'].median()
    indexes = meals_ds.loc[meals_ds['City'] == city].index
    meals_ds.loc[indexes,
                 'Price Range'] = meals_ds.loc[indexes,
                                               'Price Range'].fillna(median_)

# prepare and analyze Cuisine Style
print('Creating cuisine style set')
meals_ds['Cuisine Style'] = meals_ds['Cuisine Style'].apply(
    str_to_list)  # Turn all values in column into list
cuisine_set = set(list(flatten(meals_ds['Cuisine Style'].dropna().values))
                  )  # Create a set with cuisine styles

print('Create dummies for Cuisine Style')
for item in cuisine_set:
    meals_ds[item] = meals_ds['Cuisine Style'].apply(find_item)

print('Fill empty values by most popular cuisine')
# Group cuisines by city and fill empty values by most popular cuisine in the city
for city in city_set:
    city_cuisine = pop_cuisine(city)
    indexes = meals_ds.loc[meals_ds['City'] == city].index
    # Dummy value for most popular cuisine is 1. For other cuisines value is 0
    meals_ds.loc[indexes, city_cuisine] = meals_ds.loc[indexes,
                                                       city_cuisine].fillna(1)
    meals_ds.loc[indexes, cuisine_set] = meals_ds.loc[indexes,
Example #37
 def names(self):
     """Get the names in an expression"""
     if is_term(self.terms):
         return frozenset([self.terms.name])
     return frozenset(term.name for term in com.flatten(self.terms))
Example #38
def flatten_to_unique(input_list):
    return_list = []
    for item in set(flatten(input_list)):
        return_list.append({item})
    return return_list
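
A quick usage sketch of flatten_to_unique as defined above (the input is made up, and set iteration order is not guaranteed):

nested = [['a', 'b'], ['b', 'c'], 'c']
print(flatten_to_unique(nested))  # e.g. [{'a'}, {'b'}, {'c'}]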