def _get_authors(self, authors_path, num_partitions, n_partition):
    # Load the full list of author ids, split it into equal partitions,
    # and keep only the partition this worker is responsible for.
    self.authors = np.load(authors_path)
    printer.print_progress(f'Got {len(self.authors)} unique authors')
    self.authors = np.array_split(self.authors, num_partitions)
    printer.print_warning(f'Number of partitions: {len(self.authors)}')
    self.authors = self.authors[n_partition]
    printer.print_progress(f'Partition {n_partition} contains {len(self.authors)} authors')
def __init__(self, input_path, output_path):
    # Load the comments CSV, keeping ids and raw text separately.
    self.docs = pd.read_csv(input_path)
    self.docs = self.docs.fillna('0')
    self.comment_ids = self.docs['comment_id']
    self.docs = self.docs['comment_text'].tolist()
    self.tokenized_docs = []
    printer.print_progress('Text loaded')
    self.output_path = output_path
def start(self):
    # Record the byte offset of every comment line so a comment can
    # later be read back with a direct seek instead of a full scan.
    printer.print_progress('Start collecting')
    with open(self.input_path, 'rb') as f:
        f.readline()  # skip the header line
        for _ in tqdm(range(self.number_of_lines)):
            offset = f.tell()
            line = f.readline().decode('utf-8')
            if not line:
                break
            comment_id = self._parse_line(line)[0]
            self.output.append([comment_id, offset])
    self._save()
def make_prep(self):
    # Ask the VK API for the total member count and prime the
    # progress-printer coroutine before the main download loop.
    current_pos = 0
    resp = get_url_response(
        VkWorker.api_url,
        {
            "method": "groups.getMembers",
            "params": "group_id={id}&count=1".format(id=self.group_id),
            "token": self.token,
            "api_version": self.version,
        })
    total_amount = extract_total(resp)
    printer = print_progress()
    printer.send(None)  # advance the generator to its first yield
    return current_pos, total_amount, printer
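# A minimal sketch of what a coroutine-style print_progress could look like;
# the actual implementation in this repo may differ. It is primed with
# send(None) in make_prep() and then fed (current, total) pairs as the
# download proceeds. Everything below beyond the name print_progress is
# illustrative, not taken from the repo.
def print_progress():
    # Receive (current, total) pairs and echo a simple progress line.
    while True:
        current, total = yield
        print(f'{current}/{total} members fetched')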
def _get_authors(self):
    # Keep only authors with at least min_comment_count comments.
    comment_count_df = self.train_data_df.groupby('author_id')['comment_id'].count()
    comment_count_df = comment_count_df.reset_index()
    comment_count_df = comment_count_df[
        comment_count_df['comment_id'] >= self.min_comment_count]
    self.authors = comment_count_df['author_id'].unique()
    printer.print_progress(f'Got {len(self.authors)} unique authors')
def _prepare_data(self):
    # Parse raw timestamp strings into datetime objects.
    self.train_data_df['timestamp'] = pd.to_datetime(self.train_data_df['timestamp'])
    printer.print_progress('Data prepared')
def _write_header(self):
    # Create the output CSV and write the column header.
    # newline='' prevents the csv module from emitting blank rows on Windows.
    with open(self.output_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['comment_id', 'comment_text'])
    printer.print_progress('Header added')
def _prepare_data(self):
    # Parse timestamps and sort newest-first.
    self.data_df['timestamp'] = pd.to_datetime(self.data_df['timestamp'])
    self.data_df = self.data_df.sort_values(by='timestamp', ascending=False)
    printer.print_progress('Data prepared')
def _prepare_data(self):
    # Parse timestamps of the top comments and sort newest-first.
    self.train_data_df_top_comments['timestamp'] = pd.to_datetime(
        self.train_data_df_top_comments['timestamp'])
    self.train_data_df_top_comments = self.train_data_df_top_comments.sort_values(
        by='timestamp', ascending=False)
    printer.print_progress('Data prepared')
def make_tf_idf(self):
    # Fit a gensim TF-IDF model on the prepared corpus.
    printer.print_progress('Running TF-IDF model')
    self.model = TfidfModel(self.corpus, normalize=False)
    printer.print_success('TF-IDF model created')
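# A small usage sketch for the fitted model, assuming self.corpus holds
# gensim bag-of-words vectors (lists of (token_id, count) pairs). The
# helper itself is illustrative, not part of the repo. With
# normalize=False the returned weights are raw tf * idf values rather
# than L2-normalized ones.
def transform_doc(self, bow_doc):
    # Apply the fitted TF-IDF model to one bag-of-words document,
    # returning its (token_id, weight) pairs.
    return self.model[bow_doc]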
def _load_offsets(self):
    # Build a comment_id -> byte-offset lookup table from the offsets CSV.
    offset_df = pd.read_csv(self.offset_path)
    self.offset_dict = offset_df.set_index('comment_id').to_dict()['offset']
    printer.print_progress('Offsets loaded')
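# A minimal usage sketch tying together the offset index built in start()
# and loaded in _load_offsets(): seeking straight to a stored byte offset
# retrieves one comment without scanning the whole file. get_comment and
# self.input_path here are illustrative, not taken from the repo.
def get_comment(self, comment_id):
    # Jump to the recorded offset and read back a single comment line.
    with open(self.input_path, 'rb') as f:
        f.seek(self.offset_dict[comment_id])
        return f.readline().decode('utf-8')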