class EbayService:
    """Fetches the latest eBay phone listings and normalises their titles."""

    logger = Logger.of('EbayService')
    client = EbayClient()
    predictor = Seq2SeqPredictor.from_file(Config.get_filepath('predictor-model'),
                                           Config.get_filepath('predictor-weights'))
    embeddings = WordEmbeddings.from_file(Config.get_filepath('word2vec'))

    @classmethod
    def get_latest_phones(cls) -> List[EbayPhone]:
        """Return the newest listings, each annotated with a formatted title and persisted."""
        latest = cls.client.get_latest_phones()
        for listing in latest:
            listing.formatted_title = cls.__get_formatted_title(listing)
            cls.__save(listing)
        return latest

    @classmethod
    def __get_formatted_title(cls, phone: EbayPhone):
        # Best-effort: any prediction failure degrades to the UNKNOWN token.
        try:
            return cls.__format_title(phone.title)
        except Exception as error:
            cls.logger.error(f'error processing phone "{phone.title}": {error}')
            return WordEmbeddings.UNKNOWN

    @classmethod
    def __format_title(cls, ebay_title: str) -> str:
        """Run the seq2seq model: raw title -> indices -> one-hots -> sentence."""
        indexes = cls.embeddings.sentences_to_indices([ebay_title])
        prediction_ohs = cls.predictor.predict(indexes)[0]
        sentence = cls.embeddings.ohs_to_sentence(prediction_ohs)
        # Strip filler tokens the model may emit before returning.
        return sentence.replace(WordEmbeddings.UNKNOWN, '').replace(WordEmbeddings.EMPTY, '').strip()

    @classmethod
    def __save(cls, phone):
        # Persistence failures are logged, never propagated.
        try:
            phone.save()
        except Exception as error:
            cls.logger.error(f'unable to save: {error}')
class CexService:
    """Looks up CEX phones, preferring a week-fresh DB cache over the live API."""

    logger = Logger.of('CexService')
    client = CexClient()

    @classmethod
    def find_match(cls, query: str) -> List[CexPhone]:
        """Return cached matches for *query*, falling back to a live client search."""
        cached = cls.__query_db(query)
        if len(cached) > 0:
            return cached
        return cls.__query_client(query)

    @classmethod
    def __query_db(cls, query):
        # Only results stored within the last 7 days count as fresh.
        return CexPhone.objects(
            query__string=query,
            query__date__gte=Date().minus_days(7).as_date()).all()

    @classmethod
    def __query_client(cls, query):
        results = cls.client.find_phone(query)
        for_each(cls.__save, results)
        return results

    @classmethod
    def __save(cls, phone):
        # Persistence failures are logged, never propagated.
        try:
            phone.save()
        except Exception as error:
            cls.logger.error(f'unable to save: {error}')
class PhoneService:
    """Orchestrates the eBay -> CEX price-comparison pipeline."""

    logger = Logger.of('PhoneService')
    PRICE_THRESHOLD = 0.9  # notify only when eBay price <= 90% of the CEX cash price
    ebay_service = EbayService()
    cex_service = CexService()
    notification_service = NotificationService()

    @classmethod
    def fetch_latest_phones(cls):
        """Pull the newest eBay listings and price-check each one."""
        latest = cls.ebay_service.get_latest_phones()
        cls.logger.info(f'found {len(latest)} phones')
        for listing in latest:
            cls.check_price(listing)

    @classmethod
    def check_price(cls, ebay_phone: EbayPhone):
        """Compare one listing against the CEX cash price; notify on a good deal."""
        cls.logger.info(f'{ebay_phone.title} (£{ebay_phone.price})')
        cls.logger.info(f' |--> {ebay_phone.formatted_title}')
        # Only act on listings the model recognised, from trusted sellers.
        if not (ebay_phone.is_recognized and ebay_phone.has_trusted_seller):
            return
        cex_phones = cls.cex_service.find_match(ebay_phone.formatted_title)
        cex_price = average_cash_price(cex_phones)
        if cex_price > 0 and cex_price * cls.PRICE_THRESHOLD >= ebay_phone.price:
            cls.notification_service.send_notification(ebay_phone, cex_price)
class DataSet:
    """Prepares training data: padded word-index inputs (X) and padded one-hot labels (y)."""

    logger = Logger.of('DataSet')

    def __init__(self, embeddings: WordEmbeddings, data_file, x_label='title',
                 y_labels=('brand',)):
        """Load *data_file* (CSV) and build the X/y arrays.

        :param embeddings: word embeddings used for indexing and one-hot encoding
        :param data_file: path to the CSV training data
        :param x_label: column holding the input sentence
        :param y_labels: columns joined (space-separated) to form the target sentence
        """
        # FIX: default changed from the mutable ['brand'] to the equivalent
        # tuple ('brand',) to avoid the shared-mutable-default pitfall; the
        # value is only iterated / membership-tested, so callers are unaffected.
        self.data = pd.read_csv(data_file)
        self.embeddings = embeddings
        self.vocab_size = self.embeddings.size
        self.__prepare_x(x_label)
        self.__prepare_y(y_labels)
        self.logger.info('DataSet created')

    def __prepare_x(self, x_label):
        """Index the input sentences and pad them to the longest one."""
        self.logger.info('preparing X')
        self.X_raw = self.data[x_label].values
        self.X_max_len = max_len(self.X_raw)
        indexed_sentences = self.embeddings.sentences_to_indices(self.X_raw)
        self.X_indexed = pad_sequences(indexed_sentences,
                                       self.X_max_len,
                                       dtype='int32')

    def __prepare_y(self, y_labels):
        """Build, one-hot encode and pad the target sentences."""
        self.logger.info('preparing y')

        def create_label(row):
            # An unknown model invalidates the whole label.
            if 'model' in y_labels and row['model'] == WordEmbeddings.UNKNOWN:
                return WordEmbeddings.UNKNOWN
            return ' '.join([
                row[label] for label in y_labels
                if WordEmbeddings.UNKNOWN not in row[label]
            ])

        self.y_raw = [create_label(row) for index, row in self.data.iterrows()]
        self.y_max_len = max_len(self.y_raw)
        y_oh = self.embeddings.sentences_to_oh(self.y_raw)
        # Pad in 10 chunks (presumably to bound peak memory — TODO confirm),
        # then stitch the chunks back together.
        y_oh = [
            pad_sequences(y_oh_part,
                          maxlen=self.y_max_len,
                          padding='post',
                          value=self.embeddings.get_oh(WordEmbeddings.EMPTY))
            for y_oh_part in np.array_split(y_oh, 10)
        ]
        self.y_oh = np.concatenate(y_oh)

    def get_all(self):
        """Return the full (X, y) arrays."""
        return self.X_indexed, self.y_oh

    def get_train_test_data(self, test_size=0.2):
        """Split into train/test sets with a fixed seed for reproducibility.

        :return: X_train, X_test, y_train, y_test
        """
        return train_test_split(self.X_indexed,
                                self.y_oh,
                                test_size=test_size,
                                random_state=48)
class EbayClient:
    """Thin eBay Browse API client with OAuth client-credentials token management."""

    client_id: str
    client_secret: str
    access_token: AccessToken = None  # lazily fetched / refreshed by __update_token
    logger = Logger.of('EbayClient')

    def __init__(self):
        self.client_id = Config.ebay['client_id']
        self.client_secret = Config.ebay['client_secret']

    @retry(times=5, wait=10)
    def __get_access_token(self) -> str:
        """Fetch a fresh OAuth access token via the client-credentials grant.

        FIX: return annotation corrected from ``dict`` to ``str`` — the method
        returns ``response.json()['access_token']``, the token string itself.
        """
        url = 'https://api.ebay.com/identity/v1/oauth2/token'
        auth = HTTPBasicAuth(self.client_id, self.client_secret)
        headers = {'Content-Type': 'application/x-www-form-urlencoded'}
        payload = {
            'grant_type': 'client_credentials',
            'scope': 'https://api.ebay.com/oauth/api_scope'
        }
        response = requests.post(url=url,
                                 auth=auth,
                                 headers=headers,
                                 data=payload)
        return response.json()['access_token']

    def __update_token(self) -> None:
        """Fetch a new token only when none is cached or the cached one expired."""
        if self.access_token is None or self.access_token.is_expired():
            self.logger.info('updating token')
            token = self.__get_access_token()
            self.access_token = AccessToken(token)

    @retry(times=5, wait=10)
    def __search(self, query, start_time) -> dict:
        """Search GB fixed-price phone listings created since *start_time*."""
        filters = 'conditionIds:{1000|1500|2000|2500|3000|4000|5000}'
        filters += ',buyingOptions:{FIXED_PRICE}'
        filters += ',deliveryCountry:GB'
        filters += ',itemStartDate:[{}]'.format(start_time)
        filters += ',price:[10..350]'
        filters += ',priceCurrency:GBP'
        filters += ',itemLocationCountry:GB'
        url = 'https://api.ebay.com/buy/browse/v1/item_summary/search'
        headers = {
            'authorization': f'Bearer {self.access_token.token}',
            'x-ebay-c-marketplace-id': 'EBAY_GB'
        }
        # category_ids 9355 is the marketplace category queried here.
        params = {'q': query, 'category_ids': '9355', 'filter': filters}
        result = requests.get(url=url, params=params, headers=headers)
        return result.json().get('itemSummaries', [])

    def get_latest_phones(self, minutes=10) -> list:
        """Return EbayPhone objects listed within the last *minutes* minutes."""
        self.__update_token()
        phones = self.__search(
            query='phone',
            start_time=Date().minus_minutes(minutes).as_iso())
        return list(map(EbayPhoneMapper.map, phones))
class NotificationService:
    """Sends deal notifications through the webhooks client."""

    logger = Logger.of('NotificationService')
    webhooks_client = WebhooksClient()

    @classmethod
    def send_notification(cls, ebay_phone: EbayPhone, cex_price: float):
        """Notify about a good deal: the eBay listing vs the average CEX cash price.

        FIX: *cex_price* annotation corrected from ``str`` to ``float`` — the
        caller (PhoneService.check_price) compares it numerically and passes
        the result of ``average_cash_price``.
        """
        title = f'{ebay_phone.title}<br>--{ebay_phone.formatted_title}<br>'
        price = f'<i>Ebay price: {ebay_phone.price} / Cex price: {cex_price}</i>'
        url = ebay_phone.url
        cls.logger.info('Sending event')
        cls.logger.info(f'|--> {title}')
        cls.logger.info(f'|--> {price}')
        cls.logger.info(f'|--> {url}')
        cls.webhooks_client.send_notification(title, price, url)
class ModelPredictor:
    """Stacked-LSTM classifier over word-index sentences (single softmax output)."""

    logger = Logger.of('ModelPredictor')

    def __init__(self, w2v_model, input_shape=(30, ), output_size=4367):
        embedding = self.__embeddings_layer(w2v_model)
        self.model = self.__build_model(input_shape, embedding, output_size)

    @staticmethod
    def __build_model(input_shape, embeddings_layer, output_size):
        """Assemble and compile the LSTM -> dense softmax network."""
        sentence_indices = Input(input_shape, dtype='int32')
        x = embeddings_layer(sentence_indices)
        x = LSTM(256, return_sequences=True)(x)
        x = Dropout(0.5)(x)
        x = LSTM(256, return_sequences=False)(x)
        x = Dropout(0.5)(x)
        x = Dense(512)(x)
        x = Dropout(0.3)(x)
        x = Dense(output_size)(x)
        x = Activation('softmax')(x)
        network = Model(inputs=sentence_indices, outputs=x)
        network.compile(loss='categorical_crossentropy',
                        optimizer='adam',
                        metrics=['accuracy'])
        return network

    @staticmethod
    def __embeddings_layer(w2v_model):
        # gensim builds the equivalent Keras Embedding layer directly
        # (a manual frozen-weights construction was tried here previously).
        return w2v_model.wv.get_keras_embedding()

    def train(self, X_train, y_train):
        self.model.fit(X_train, y_train,
                       epochs=2000, batch_size=32, shuffle=True)

    def test(self, X_test, y_test):
        """Evaluate on held-out data and log loss/accuracy."""
        loss, acc = self.model.evaluate(X_test, y_test)
        self.logger.info()
        self.logger.info(f'Test loss = {loss}')
        self.logger.info(f'Test accuracy = {acc}')
class CexClient:
    """Client for the CEX predictive-search API."""

    logger = Logger.of('CexClient')

    @retry(times=5, wait=10, default_response=[])
    def __search(self, query):
        """Query the CEX API; return the raw result list (empty on any miss)."""
        url = 'https://wss2.cex.uk.webuy.io/v3/boxes/predictivesearch?'
        params = {'q': query}
        result = requests.get(url=url, params=params)
        # FIX: the fallback dict used the misspelled key 'akc', so a payload
        # missing 'response' raised KeyError on response['ack'] instead of
        # failing the assertion below (which the @retry decorator handles).
        response = result.json().get('response', {'ack': 'Failure'})
        assert response['ack'] == 'Success', 'Failed to make get request to CEX'
        self.logger.debug(f'cex response: {response}')
        # Both 'data' and 'data.results' may be present-but-null; treat as empty.
        data = response.get('data')
        results = data.get('results', []) if data is not None else []
        return results if results is not None else []

    def find_phone(self, query):
        """Search CEX for *query* and map raw results to CexPhone objects."""
        self.logger.debug(f'cex query: {query}')
        results = self.__search(query)
        return list(
            map(lambda result: CexPhoneMapper.map(result, query), results))
from domain.ebay import EbayPhone
from domain.phone import PhoneDetails
import pandas as pd
import numpy as np
from utils.text_utils import match_word, tokenize, update_vocabulary
from utils.logging import Logger
from nlp.embeddings import WordEmbeddings
from nlp.training import DataSet
from config import Config

logger = Logger.of('DataPreparation')


def create_embeddings():
    """Train word embeddings from the training-data titles and persist them."""
    titles = pd.read_csv(Config.get_filepath('train-data'))['title'].tolist()
    # Wrap every title in EMP sentinels so the padding token gets a vector.
    wrapped = [f'EMP {title} EMP' for title in titles]
    embeddings = WordEmbeddings.from_sentences(wrapped)
    embeddings.save(Config.get_filepath('word2vec'))


def clean_phones():
    """Reset phone details and normalise noisy characters in stored titles."""
    logger.info('cleaning phones')
    EbayPhone.objects().update(details=PhoneDetails())
    # Same order as the original per-character passes: '*', '/', then '+'.
    for char, substitute in (('*', ' '), ('/', ' '), ('+', ' plus ')):
        for phone in EbayPhone.objects(title__contains=char):
            phone.update(title=phone.title.replace(char, substitute))
from functools import wraps
from time import sleep

from utils.logging import Logger

logger = Logger.of('Retry')


def retry(times=5, wait=10, default_response=None):
    """Decorator retrying a failing call, logging each failure.

    :param times: maximum number of attempts
    :param wait: seconds to sleep between attempts
    :param default_response: value returned after all attempts fail
        (an empty dict when left as None)
    """
    def decorator(http_call):
        # FIX: preserve the wrapped callable's name/docstring for logs & debugging.
        @wraps(http_call)
        def wrapper(*original_args, **original_kwargs):
            for count in range(times):
                try:
                    return http_call(*original_args, **original_kwargs)
                except Exception as error:
                    logger.error(f'{error}, Retry {count+1}')
                    # FIX: don't waste *wait* seconds after the final attempt —
                    # there is nothing left to retry.
                    if count + 1 < times:
                        sleep(wait)
            return default_response if default_response is not None else {}
        return wrapper
    return decorator
class Seq2SeqPredictor:
    """Encoder/decoder LSTM that maps title word-indices to per-timestep one-hots."""

    logger = Logger.of('Seq2SeqPredictor')

    def __init__(self, model):
        self.model = model
        self.__compile()
        self.logger.info('Seq2SeqPredictor created')

    @classmethod
    def new(cls, word_embeddings, output_shape):
        """Build a fresh network; *output_shape* is (output_seq_len, vocab_size)."""
        cls.logger.info('creating new Seq2SeqPredictor model')
        network = Sequential()
        network.add(word_embeddings.keras_embeddings_layer())
        network.add(LSTM(512))
        network.add(Dropout(0.5))
        # Repeat the encoded vector once per output timestep.
        network.add(RepeatVector(output_shape[0]))
        network.add(LSTM(256, return_sequences=True))
        network.add(Dropout(0.5))
        network.add(TimeDistributed(Dense(1024)))
        network.add(Dropout(0.3))
        network.add(TimeDistributed(Dense(output_shape[1])))
        network.add(Activation('softmax'))
        return Seq2SeqPredictor(network)

    @classmethod
    def from_file(cls, model_file, weights_file):
        """Restore a model from its JSON architecture file plus a weights file."""
        cls.logger.info(
            f'loading Seq2SeqPredictor model from {model_file} and weights from {weights_file}'
        )
        # Renamed the handle so it no longer shadows the model_file parameter.
        with open(model_file, 'r') as architecture_file:
            model_json = architecture_file.read()
        model = model_from_json(model_json)
        model.load_weights(weights_file)
        return Seq2SeqPredictor(model)

    def __compile(self):
        self.logger.info('compiling Seq2SeqPredictor model')
        self.model.compile(loss='categorical_crossentropy',
                           optimizer='adam',
                           metrics=['accuracy'])

    def summary(self):
        self.model.summary()

    def train(self, X_train, y_train, epochs=50, batch_size=32, shuffle=True):
        self.model.fit(X_train, y_train,
                       epochs=epochs,
                       batch_size=batch_size,
                       shuffle=shuffle)

    def test(self, X_test, y_test):
        """Evaluate on held-out data and log loss/accuracy."""
        loss, acc = self.model.evaluate(X_test, y_test)
        self.logger.info()
        self.logger.info(f'test loss = {loss}')
        self.logger.info(f'test accuracy = {acc}')

    def predict(self, X):
        return self.model.predict(X)

    def save(self, model_file, weights_file):
        """Persist the architecture (JSON) and the weights to separate files."""
        self.logger.info(
            f'saving model to file {model_file} and weights to file {weights_file}'
        )
        with open(model_file, 'w') as file:
            file.write(self.model.to_json())
        self.model.save_weights(weights_file)
from time import sleep

from utils.logging import Logger
from services.phone_service import PhoneService

logger = Logger.of('MAIN')


def main():
    """Poll eBay for new listings forever, pausing between rounds."""
    while True:
        PhoneService.fetch_latest_phones()
        logger.info()
        sleep(560)


if __name__ == '__main__':
    main()
from nlp.embeddings import WordEmbeddings
from nlp.Seq2SeqPredictor import Seq2SeqPredictor
from nlp.training import DataSet
from utils.logging import Logger
import pandas as pd
from config import Config

logger = Logger.of('TextProcessing')

# Build the dataset from the persisted embeddings and the training CSV.
embeddings = WordEmbeddings.from_file(Config.get_filepath('word2vec'))
dataset = DataSet(embeddings,
                  Config.get_filepath('train-data'),
                  y_labels=['brand', 'model', 'memory', 'color', 'network'])

x_max_len = dataset.X_max_len
y_max_len = dataset.y_max_len
vocab_size = dataset.vocab_size
logger.info(f'max len: {x_max_len}/{y_max_len}, labels count {vocab_size}')

X_train, X_test, y_train, y_test = dataset.get_train_test_data(test_size=0.1)

# Train a fresh seq2seq model, persist it, then evaluate.
predictor = Seq2SeqPredictor.new(embeddings, (y_max_len, vocab_size))
predictor.summary()
predictor.train(X_train, y_train, epochs=40)
predictor.save(Config.get_filepath('predictor-model'),
               Config.get_filepath('predictor-weights'))
predictor.test(X_test, y_test)

# Decode a sample of predictions (presumably inspected via debugger/REPL —
# the decoded sentence is computed but not otherwise used).
test = X_test[0:4000]
expected = y_test[0:4000]
results = predictor.predict(test)
for (inp, exp, res) in zip(test, expected, results):
    r = embeddings.ohs_to_sentence(res)
class WordEmbeddings:
    """Word2Vec wrapper: word <-> index <-> one-hot conversions for the seq2seq model."""

    logger = Logger.of('WordEmbeddings')
    EMPTY = 'EMP'        # padding token
    UNKNOWN = 'unknown'  # fallback token for out-of-vocabulary words

    def __init__(self, model):
        self.model = model
        # +1 keeps one extra slot beyond the vocabulary in the one-hot space.
        self.size = len(model.wv.vocab) + 1
        self.oh_dict = {}  # memoised one-hot vectors, keyed by word
        self.logger.info('WordEmbeddings created')

    @classmethod
    def from_file(cls, filename):
        """Load a persisted Word2Vec model."""
        # FIX: the message previously never interpolated the filename.
        cls.logger.info(f'loading WordEmbeddings from file {filename}')
        model = Word2Vec.load(filename)
        return WordEmbeddings(model)

    @classmethod
    def from_sentences(cls, sentences, size=25, window=5, min_count=1):
        """Train a fresh Word2Vec model from raw sentences."""
        cls.logger.info('creating new WordEmbeddings from text')
        processes_sentences = tokenize(sentences)
        model = Word2Vec(processes_sentences,
                         size=size,
                         window=window,
                         min_count=min_count)
        return WordEmbeddings(model)

    def sentences_to_indices(self, sentences):
        """Tokenise sentences and map each word to its vocabulary index."""
        processes_sentences = tokenize(sentences)
        return np.array([[self.get_index(word) for word in sentence]
                         for sentence in processes_sentences])

    def sentences_to_oh(self, sentences):
        """Tokenise sentences and map each word to its one-hot vector."""
        processes_sentences = tokenize(sentences)
        return np.array([[self.get_oh(word) for word in sentence]
                         for sentence in processes_sentences])

    def info(self):
        self.logger.info(f'number of word vectors: {self.size}')

    def save(self, filename):
        """Normalise vectors in place and persist the model."""
        # FIX: the message previously never interpolated the filename.
        self.logger.info(f'saving word embeddings to {filename}')
        self.model.init_sims(replace=True)
        self.model.save(filename)

    def keras_embeddings_layer(self):
        """Build a trainable Keras Embedding layer initialised with the w2v weights."""
        vocab_len = len(self.model.wv.vocab)
        emb_dim = self.model.wv.vector_size
        emb_matrix = self.model.wv.syn0
        embedding_layer = Embedding(vocab_len, emb_dim, trainable=True)
        embedding_layer.build((None, ))
        embedding_layer.set_weights([emb_matrix])
        return embedding_layer

    def get_index(self, word):
        """Vocabulary index of *word*, falling back to the UNKNOWN token's index."""
        try:
            return self.model.wv.vocab[word].index
        # FIX: only a vocabulary miss (KeyError) should trigger the fallback;
        # the previous broad `except Exception` also masked unrelated errors.
        except KeyError:
            self.logger.error(f'unknown word {word}')
            return self.model.wv.vocab[WordEmbeddings.UNKNOWN].index

    def get_word(self, index):
        assert index < self.size, 'index is greater than vocab size'
        return self.model.wv.index2word[index]

    def get_vector(self, word):
        return self.model.wv[word]

    def get_oh(self, word):
        """One-hot vector of *word* (memoised)."""
        if word not in self.oh_dict:
            index = self.get_index(word)
            self.oh_dict[word] = create_oh_vector(index, self.size)
        return self.oh_dict[word]

    def get_word_for_oh(self, oh_encoding):
        """Decode a single one-hot (argmax) back to its word."""
        assert oh_encoding.shape[
            0] == self.size, f'must have a size of {self.size}'
        index = np.argmax(oh_encoding)
        return self.get_word(index)

    def ohs_to_sentence(self, ohs):
        """Decode a 2-D array of one-hots into a space-joined sentence."""
        assert len(ohs.shape) == 2, 'must be an array of ohs'
        words = [self.get_word_for_oh(oh) for oh in ohs]
        return ' '.join(words)

    def indexes_to_sentence(self, indexes):
        """Decode a sequence of vocabulary indices into a space-joined sentence."""
        words = [self.get_word(index) for index in indexes]
        return ' '.join(words)