def read_articles():
    # connect to Hive and read the articles
    print('>>> reading articles from Hive...')
    cursor = hive.connect('quickstart.cloudera').cursor()

    hive_query = ("SELECT title, publishedAt, content FROM article "
                  "WHERE unix_timestamp(publishedAt) > cast(CURRENT_TIMESTAMP AS BIGINT) - "
                  "{lookback_period}"
                  )
    hive_query = hive_query.format(
        lookback_period=LOOKBACK_DAYS * 24 * 60 * 60)
    cursor.execute(hive_query)
    res = cursor.fetchall()
    print("<<< %d articles read" % len(res))

    # convert the publication date to a date-time
    articles = pd.DataFrame(res, columns=['title', 'publishedAt', 'content'])
    articles['publishedAt'] = pd.to_datetime(articles['publishedAt'])
    articles['publishedAtHour'] = articles['publishedAt'].dt.strftime(
        "%Y-%m-%d %H")

    # count how many articles were published each hour
    articles_grouped = articles['title'].groupby(articles['publishedAtHour'])
    articles_cnt_hourly = articles_grouped.count()
    # draw a line-plot with the results
    # articles_cnt_hourly.plot.line()

    # count the most frequently occurring words
    content = [c for c in articles['content'].tolist() if c is not None]
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(content)

    word_counts = loads(tokenizer.get_config()['word_counts'])
    most_frequent_words = {k: v for k, v in sorted(
        word_counts.items(), key=lambda item: item[1], reverse=True)}
    words, counts = zip(*most_frequent_words.items())
    words = words[:MAX_PLOT_WORDS]
    counts = counts[:MAX_PLOT_WORDS]

    return articles_cnt_hourly, words, counts
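
A minimal usage sketch for read_articles(), assuming matplotlib is installed and that the Hive instance and the LOOKBACK_DAYS / MAX_NUM_WORDS / MAX_PLOT_WORDS constants from the surrounding module are available; it draws the hourly line plot hinted at by the commented-out call above, plus a bar chart of the top words:

import matplotlib.pyplot as plt

articles_cnt_hourly, words, counts = read_articles()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))
articles_cnt_hourly.plot.line(ax=ax1, title='Articles published per hour')
ax2.bar(words, counts)
ax2.set_title('Most frequent words')
ax2.tick_params(axis='x', rotation=90)
plt.tight_layout()
plt.show()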
Example #2
def get_args():
    # the opening lines of this snippet were truncated in the source; the parser
    # setup and the "--epochs" flag name are a plausible reconstruction
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int,
                        default=5,
                        help="Epochs to train the model.")
    args = parser.parse_args()
    return args


# preprocess in place: reassigning the loop variable would not modify the lists
english_sentences = [preprocess_sentence(es) for es in english_sentences]

german_sentences = [start_token + preprocess_sentence(gs) + end_token
                    for gs in german_sentences]

tokenizer = Tokenizer(filters='')

tokenizer.fit_on_texts(german_sentences)
config = tokenizer.get_config()
word_index = json.loads(config['word_index'])
index_words = json.loads(config['index_word'])

num_samples = 5
inx = np.random.choice(len(english_sentences), num_samples, replace=False)
print(inx)

sequences = tokenizer.texts_to_sequences(german_sentences)
padded = pad_sequences(sequences, padding='post', value=0)


def map_embedding_f(x, y):
    inp = []
    # left-pad the first axis of x up to 13 steps (the second axis is left unchanged)
    pad = tf.pad(x, paddings=[[13 - tf.shape(x)[0], 0], [0, 0]], mode='CONSTANT')
    inp.append(pad)
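
A quick sanity check, assuming the variables above are in scope: the padded sequences can be decoded back to text through the index_word mapping loaded from the tokenizer config (its keys are strings):

# decode one padded German sequence, skipping the 0 padding value
sample = padded[0]
decoded = ' '.join(index_words[str(idx)] for idx in sample if idx != 0)
print(decoded)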
Example #3
sentence_data = string_data.split(".")

#print(sentence_data)

additional_filters = '-' '""'  # adjacent literals concatenate to the single string '-""'

token = Tokenizer(num_words=None,
                  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n' + additional_filters,
                  lower=True,
                  split=" ",
                  char_level=False,
                  oov_token="UNK",
                  document_count=0)
token.fit_on_texts(sentence_data)

tokenizer_config = token.get_config()
print(tokenizer_config.keys())

#print(tokenizer_config["word_index"])

#print("\n\n\n\n\n\n\n")

import json
word_counts = json.loads(tokenizer_config['word_counts'])
#print(word_counts)
print(word_counts["the"])

index_word = json.loads(tokenizer_config['index_word'])
word_index = json.loads(tokenizer_config["word_index"])
#print(sentence_data)
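
A short sketch, assuming the objects above are in scope, of how the fitted tokenizer maps a new sentence; words it has never seen fall back to the "UNK" (oov_token) index:

new_sequence = token.texts_to_sequences(["the quick brown fox"])[0]
print(new_sequence)
print(word_index["UNK"])  # index that unseen words are mapped to
print([index_word[str(idx)] for idx in new_sequence])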
Example #4
class Transformer():
    def __init__(self):
        # try to load data from .env

        self.load_data()

        self.tokenizer = Tokenizer(num_words=1000, lower=True)
        return

    def load_data(self):
        """Read the database URL from the environment, fetch the contents of the
        strain_info table, and save it to a df for training the model.
        """
        dotenv.load_dotenv()
        alt = 'DATABASE_URL'
        # fall back to the DATABASE_URL variable when DATASOURCE is not set
        db_url = getenv("DATASOURCE", default=getenv(alt))

        engine = create_engine(db_url)
        df = pd.read_sql("SELECT * FROM strain_info", engine)
        self.df = df
        return df

    def transform(self, document: pd.DataFrame, negative: list,
                  ignore: list) -> pd.DataFrame:
        """Transform the features of the input dataframe into a combined
        document-term matrix, then scale the result.
        Arguments:
        -------------
        document {pd.DataFrame} : a dataframe whose columns hold the text features to be transformed
        negative {list} : the list of negative features to subtract when combining the dtm products
        ignore {list} : a list of features to ignore in the dtm product
        Returns:
        -------------
        combined_scaled {pd.DataFrame} : a dataframe of the transformed document's tf-idf values
        document.index {list} : the index of the input dataframe as a list
        """

        # start from a vector of 1000 zeros, one slot per tokenizer word
        # (num_words=1000); pandas broadcasts it against each feature's
        # document-term matrix below
        dtm = [0] * 1000

        for i in document.columns:
            if i in ignore:
                continue
            # take the feature's document-term matrix: negative features are
            # subtracted from the combined dtm, all others are added to it
            if i in negative:
                dtm -= self.find_dtm(document[i])
            else:
                dtm += self.find_dtm(document[i])

        mm = MinMaxScaler()
        combined_scaled_values = mm.fit_transform(dtm)
        combined_scaled_columns = dtm.columns.tolist()
        combined_scaled = pd.DataFrame(combined_scaled_values,
                                       columns=combined_scaled_columns)
        combined_scaled.fillna(0, inplace=True)
        return combined_scaled, document.index.tolist()

    def find_dtm(self, feature):
        """Tokenize a text feature and return its tf-idf document-term matrix as a df.
        """
        self.tokenizer.fit_on_texts(feature)
        a = self.tokenizer.texts_to_matrix(feature, mode='tfidf')
        config = self.tokenizer.get_config()
        # the config's word_index gives the vocabulary behind the matrix columns,
        # though feature_names is not applied to the dataframe columns here
        feature_names = json_normalize(loads(
            config['word_index'])).columns.tolist()
        dtm = pd.DataFrame(a)
        return dtm
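
A hedged usage sketch for the class above; it needs DATASOURCE (or DATABASE_URL) to point at a reachable database, and the column names passed to negative/ignore are placeholders, not actual strain_info columns:

transformer = Transformer()
# column names below are illustrative placeholders
dtm, row_index = transformer.transform(transformer.df,
                                       negative=['negative_effects'],
                                       ignore=['id'])
print(dtm.shape, len(row_index))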
Example #5
Nwords = int(1e4)
tokenizer = Tokenizer(num_words=Nwords, oov_token='<OOV>')
XXX = 5000  # cap on the number of reviews collected below
reviews, lengths = [], []
i = 0
for dataset in [train_examples]:  #, test_examples]:
    for x, y in tqdm(dataset):
        text = encoder.decode(x)
        reviews.append(text.replace('<br />', ''))
        lengths.append(len(text))
        if i > XXX: break
        i += 1
tokenizer.fit_on_texts(reviews)
word_index = tokenizer.word_index
print(f'Tokenizer found {len(word_index)} different words')
vocabulary = json.loads(tokenizer.get_config()['word_counts'])
x, y = [], []
for k, v in vocabulary.items():
    x.append(k)
    y.append(v)
inds = np.argsort(y)
inds = inds[-Nwords:][::-1]

maxlen = 1100
sequences = tokenizer.texts_to_sequences(reviews)
sequences = pad_sequences(sequences, padding='post', maxlen=maxlen)

# with open('word_index.dict','w') as f:
#    for word,index in word_index.items():
#       f.write(f'{word},{index}\n')
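
The inds array computed above is not used further in the snippet; a small sketch of how it can recover the most frequent words the tokenizer will keep:

# inds is already ordered from most to least frequent
top_words = [(x[i], y[i]) for i in inds[:20]]
print(top_words)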
Example #6
tokenizer.fit_on_texts(train_x_data['token'])

# drop words seen fewer than 5 times from the tokenizer's vocabulary;
# surviving words keep their original indices, so index gaps are left behind
low_count_words = [w for w, c in tokenizer.word_counts.items() if c < 5]
for w in low_count_words:
    del tokenizer.word_index[w]
    del tokenizer.word_docs[w]
    del tokenizer.word_counts[w]
train_sequence = tokenizer.texts_to_sequences(train_x_data['token'])
test_sequence = tokenizer.texts_to_sequences(test_x_data['token'])

sequence_data = dict()
sequence_data['train_seq'] = train_sequence
sequence_data['test_seq'] = test_sequence
sequence_data['train_token_list'] = train_x_data['token'].tolist()
sequence_data['test_token_list'] = test_x_data['token'].tolist()
sequence_data['tokenizer_config'] = tokenizer.get_config()

word_idx = tokenizer.word_index
MAX_SEQUENCE_LENGTH = int(np.median(after_len))
DATA_OUT_PATH = './assets/data/npy_data/{}/'.format(Today)
## Make output save directory
if os.path.exists(DATA_OUT_PATH):
    print("{} -- Folder already exists \n".format(DATA_OUT_PATH))
else:
    os.makedirs(DATA_OUT_PATH, exist_ok=True)
    print("{} -- Folder create complete \n".format(DATA_OUT_PATH))

train_input = pad_sequences(train_sequence,
                            maxlen=MAX_SEQUENCE_LENGTH,
                            padding='post')
train_labels = np.array(train_y)
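
A sketch of how the prepared arrays might then be written into DATA_OUT_PATH; the file names are assumptions, not from the source:

# illustrative file names for the .npy outputs
np.save(os.path.join(DATA_OUT_PATH, 'train_input.npy'), train_input)
np.save(os.path.join(DATA_OUT_PATH, 'train_labels.npy'), train_labels)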