def get_url_feature(content_id=0, content_data=''):
    """
    status: required

    Build the URL feature dict for one piece of content.

    parameters:
        content_id - id passed to Content.get_content_by_id; when truthy the
                     stored ``url`` attribute is used
        content_data - URL used directly when content_id is falsy
    returns:
        dict merging the text_analysis() result for link_to_text(url) with
        the url_analysis() result (url_analysis wins on duplicate keys),
        or {} when no URL is available
    """
    url = ''
    if content_id:
        url = Content.get_content_by_id(content_id).url
    elif content_data:
        url = content_data
    if not url:
        return {}

    # text features computed on the text that link_to_text() derives from the URL
    url_path = link_to_text(url)
    text_result = text_analysis(url_path, var_name='url', head_body='head')

    # features computed on the URL itself
    url_result = url_analysis(url, var_name='url')

    # Copy + update instead of ``a.items() + b.items()``: on Python 3 items()
    # returns views that do not support ``+``. Precedence is unchanged.
    url_dict = dict(text_result)
    url_dict.update(url_result)
    return url_dict
def get_url_feature(content_id=0, content_data=''):
    """
    status: required

    Build the URL feature dict for one piece of content.

    parameters:
        content_id - id passed to Content.get_content_by_id; when truthy the
                     stored ``url`` attribute is used
        content_data - URL used directly when content_id is falsy
    returns:
        dict merging the text_analysis() result for link_to_text(url) with
        the url_analysis() result (url_analysis wins on duplicate keys),
        or {} when no URL is available
    """
    url = ''
    if content_id:
        url = Content.get_content_by_id(content_id).url
    elif content_data:
        url = content_data
    if not url:
        return {}

    # text features computed on the text that link_to_text() derives from the URL
    url_path = link_to_text(url)
    text_result = text_analysis(url_path, var_name='url', head_body='head')

    # features computed on the URL itself
    url_result = url_analysis(url, var_name='url')

    # Copy + update instead of ``a.items() + b.items()``: on Python 3 items()
    # returns views that do not support ``+``. Precedence is unchanged.
    url_dict = dict(text_result)
    url_dict.update(url_result)
    return url_dict
def get_body():
    """
    Collect one [text, real_shares] pair for every Content row.

    returns:
        list of two-element lists: the html_to_text() result for the content
        object, followed by that object's ``real_shares`` attribute
    """
    def _row_for(content_id):
        # re-load the record by id, then pair its text with its share count
        record = Content.get_content_by_id(content_id)
        return [html_to_text(record), record.real_shares]

    # ids are gathered up front, before any per-record lookups happen
    all_ids = [entry.id for entry in Content.query.all()]
    return [_row_for(content_id) for content_id in all_ids]
def get_body():
    """
    Collect one [text, real_shares] pair for every Content row.

    returns:
        list of two-element lists: the html_to_text() result for the content
        object, followed by that object's ``real_shares`` attribute
    """
    def _row_for(content_id):
        # re-load the record by id, then pair its text with its share count
        record = Content.get_content_by_id(content_id)
        return [html_to_text(record), record.real_shares]

    # ids are gathered up front, before any per-record lookups happen
    all_ids = [entry.id for entry in Content.query.all()]
    return [_row_for(content_id) for content_id in all_ids]
def get_description_feature(content_id=0, content_data=''):
    """
    status: optional

    Build the description feature dict for one piece of content.

    parameters:
        content_id - id passed to Content.get_content_by_id; when truthy the
                     stored ``description`` is used, falling back to
                     ``title`` when the description is empty
        content_data - description text used directly when content_id is falsy
    returns:
        the text_analysis() result for the chosen text; text_analysis is
        always called, with '' when no text is available
    """
    desc_data = ''
    if content_id:
        # Load the record once; previously the title fallback triggered a
        # second, identical lookup.
        content = Content.get_content_by_id(content_id)
        desc_data = content.description or content.title
    elif content_data:
        desc_data = content_data

    # any falsy value (e.g. None from the database) is replaced by ''
    if not desc_data:
        desc_data = ''

    return text_analysis(desc_data, var_name='desc', head_body='head')
def get_description_feature(content_id=0, content_data=''):
    """
    status: optional

    Build the description feature dict for one piece of content.

    parameters:
        content_id - id passed to Content.get_content_by_id; when truthy the
                     stored ``description`` is used, falling back to
                     ``title`` when the description is empty
        content_data - description text used directly when content_id is falsy
    returns:
        the text_analysis() result for the chosen text; text_analysis is
        always called, with '' when no text is available
    """
    desc_data = ''
    if content_id:
        # Load the record once; previously the title fallback triggered a
        # second, identical lookup.
        content = Content.get_content_by_id(content_id)
        desc_data = content.description or content.title
    elif content_data:
        desc_data = content_data

    # any falsy value (e.g. None from the database) is replaced by ''
    if not desc_data:
        desc_data = ''

    return text_analysis(desc_data, var_name='desc', head_body='head')
def get_icon_feature(content_id=0, content_data=''):
    """
    status: optional

    Flag whether an icon is present for one piece of content.

    parameters:
        content_id - id passed to Content.get_content_by_id; when truthy the
                     stored ``icon_url`` is inspected
        content_data - value inspected instead when content_id is falsy
    returns:
        {'icon': 1} when the inspected value is truthy, else {'icon': 0}
    """
    if content_id:
        icon_source = Content.get_content_by_id(content_id).icon_url
    elif content_data:
        icon_source = content_data
    else:
        icon_source = ''

    return {'icon': 1 if icon_source else 0}
def get_icon_feature(content_id=0, content_data=''):
    """
    status: optional

    Flag whether an icon is present for one piece of content.

    parameters:
        content_id - id passed to Content.get_content_by_id; when truthy the
                     stored ``icon_url`` is inspected
        content_data - value inspected instead when content_id is falsy
    returns:
        {'icon': 1} when the inspected value is truthy, else {'icon': 0}
    """
    if content_id:
        icon_source = Content.get_content_by_id(content_id).icon_url
    elif content_data:
        icon_source = content_data
    else:
        icon_source = ''

    return {'icon': 1 if icon_source else 0}
def update_parent_cluster(clusters, contents, session):
    """
    Write cluster assignments onto Content rows and commit them.

    parameters:
        clusters - passed to format_cluster(); the result is used as a
                   content-id -> cluster lookup
        contents - iterable of rows whose first element is a content id
        session - object providing add() and commit()

    A content without a parent_cluster gets the cluster mapped to its id, or
    0 when its id is not in the lookup. A content that already has one is
    left alone unless its id is in the lookup; in that case every content
    returned by Content.get_content_by_parent_cluster() for the old cluster
    is moved to the newly mapped cluster.
    """
    cluster_of = format_cluster(clusters)
    ids = [entry[0] for entry in contents]

    for cid in ids:
        record = Content.get_content_by_id(cid)

        if not record.parent_cluster:
            # first assignment: mapped cluster, or 0 when unmapped
            record.parent_cluster = cluster_of[cid] if cid in cluster_of else 0
            session.add(record)
            continue

        if cid not in cluster_of:
            continue

        # previously clustered: move the whole old cluster to the new one
        siblings = Content.get_content_by_parent_cluster(record.parent_cluster)
        for sibling in siblings:
            sibling.parent_cluster = cluster_of[cid]
            session.add(sibling)

    # NOTE(review): commit is issued once, after all rows are processed -
    # confirm against the intended layout.
    session.commit()
def get_title_feature(content_id=0, content_data=''):
    """
    status: required

    Build the title feature dict for one piece of content.

    parameters:
        content_id - id passed to Content.get_content_by_id; when truthy the
                     stored ``title`` is used
        content_data - title text used directly when content_id is falsy
    returns:
        the text_analysis() result for the title, or {} when no title is
        available
    """
    if content_id:
        heading = Content.get_content_by_id(content_id).title
    else:
        heading = content_data if content_data else ''

    if heading:
        return text_analysis(heading, var_name='title', head_body='head')
    return {}
def get_title_feature(content_id=0, content_data=''):
    """
    status: required

    Build the title feature dict for one piece of content.

    parameters:
        content_id - id passed to Content.get_content_by_id; when truthy the
                     stored ``title`` is used
        content_data - title text used directly when content_id is falsy
    returns:
        the text_analysis() result for the title, or {} when no title is
        available
    """
    if content_id:
        heading = Content.get_content_by_id(content_id).title
    else:
        heading = content_data if content_data else ''

    if heading:
        return text_analysis(heading, var_name='title', head_body='head')
    return {}
def update_parent_cluster(clusters, contents, session):
    """
    Write cluster assignments onto Content rows and commit them.

    parameters:
        clusters - passed to format_cluster(); the result is used as a
                   content-id -> cluster lookup
        contents - iterable of rows whose first element is a content id
        session - object providing add() and commit()

    A content without a parent_cluster gets the cluster mapped to its id, or
    0 when its id is not in the lookup. A content that already has one is
    left alone unless its id is in the lookup; in that case every content
    returned by Content.get_content_by_parent_cluster() for the old cluster
    is moved to the newly mapped cluster.
    """
    cluster_of = format_cluster(clusters)
    ids = [entry[0] for entry in contents]

    for cid in ids:
        record = Content.get_content_by_id(cid)

        if not record.parent_cluster:
            # first assignment: mapped cluster, or 0 when unmapped
            record.parent_cluster = cluster_of[cid] if cid in cluster_of else 0
            session.add(record)
            continue

        if cid not in cluster_of:
            continue

        # previously clustered: move the whole old cluster to the new one
        siblings = Content.get_content_by_parent_cluster(record.parent_cluster)
        for sibling in siblings:
            sibling.parent_cluster = cluster_of[cid]
            session.add(sibling)

    # NOTE(review): commit is issued once, after all rows are processed -
    # confirm against the intended layout.
    session.commit()
def get_data(news_number=1000):
    """
    Load up to ``news_number`` [id, title, description] rows.

    parameters:
        news_number - maximum number of rows to return
    returns:
        list of [news_id, title, description] lists for the ids yielded by
        id_from_database(), in that order; the title stands in for an empty
        description
    """
    rows = []
    for news_id in id_from_database():
        # stop before touching the database once enough rows are collected
        if len(rows) >= news_number:
            break
        record = Content.get_content_by_id(news_id)
        headline = record.title
        summary = record.description or headline
        rows.append([news_id, headline, summary])
    return rows
def get_data(news_number=1000):
    """
    Load up to ``news_number`` [id, title, description] rows.

    parameters:
        news_number - maximum number of rows to return
    returns:
        list of [news_id, title, description] lists for the ids yielded by
        id_from_database(), in that order; the title stands in for an empty
        description
    """
    rows = []
    for news_id in id_from_database():
        # stop before touching the database once enough rows are collected
        if len(rows) >= news_number:
            break
        record = Content.get_content_by_id(news_id)
        headline = record.title
        summary = record.description or headline
        rows.append([news_id, headline, summary])
    return rows
def get_content_type_feature(content_id=0, content_data=''):
    """
    status: optional

    Build the content-type feature dict for one piece of content.

    parameters:
        content_id - id passed to Content.get_content_by_id; when truthy the
                     stored ``type_id`` is used if it is an integer or an
                     all-digit string
        content_data - used when content_id is falsy; either an object with
                       an ``id`` attribute or a plain type value
    returns:
        {'content_type': <value>}, with 0 when no usable type is available
    """
    data = 0
    if content_id:
        type_id = Content.get_content_by_id(content_id).type_id
        if isinstance(type_id, int):
            # an integer type_id has no .isdigit(); take it as-is
            data = type_id
        elif type_id and type_id.isdigit():
            data = int(type_id)
    elif content_data:
        data = content_data

    if not data:
        return {'content_type': 0}

    # Objects contribute their ``id``; plain values (such as the int parsed
    # from type_id above) have no ``id`` attribute and are used directly.
    return {'content_type': getattr(data, 'id', data)}
def get_timestamp_feature(content_id=0, content_data=''):
    """
    status: required

    Build the publication-time feature dict for one piece of content.

    parameters:
        content_id - id passed to Content.get_content_by_id; when truthy the
                     stored ``timestamp`` is used
        content_data - timestamp value used directly when content_id is falsy
    returns:
        {'timestamp_day': day_published(ts),
         'timestamp_hour': hour_published(ts)},
        or {} when no timestamp is available
    """
    if content_id:
        published = Content.get_content_by_id(content_id).timestamp
    else:
        published = content_data if content_data else ''

    if not published:
        return {}

    return {
        'timestamp_day': day_published(published),
        'timestamp_hour': hour_published(published),
    }
def get_content_type_feature(content_id=0, content_data=''):
    """
    status: optional

    Build the content-type feature dict for one piece of content.

    parameters:
        content_id - id passed to Content.get_content_by_id; when truthy the
                     stored ``type_id`` is used if it is an integer or an
                     all-digit string
        content_data - used when content_id is falsy; either an object with
                       an ``id`` attribute or a plain type value
    returns:
        {'content_type': <value>}, with 0 when no usable type is available
    """
    data = 0
    if content_id:
        type_id = Content.get_content_by_id(content_id).type_id
        if isinstance(type_id, int):
            # an integer type_id has no .isdigit(); take it as-is
            data = type_id
        elif type_id and type_id.isdigit():
            data = int(type_id)
    elif content_data:
        data = content_data

    if not data:
        return {'content_type': 0}

    # Objects contribute their ``id``; plain values (such as the int parsed
    # from type_id above) have no ``id`` attribute and are used directly.
    return {'content_type': getattr(data, 'id', data)}
def get_timestamp_feature(content_id=0, content_data=''):
    """
    status: required

    Build the publication-time feature dict for one piece of content.

    parameters:
        content_id - id passed to Content.get_content_by_id; when truthy the
                     stored ``timestamp`` is used
        content_data - timestamp value used directly when content_id is falsy
    returns:
        {'timestamp_day': day_published(ts),
         'timestamp_hour': hour_published(ts)},
        or {} when no timestamp is available
    """
    if content_id:
        published = Content.get_content_by_id(content_id).timestamp
    else:
        published = content_data if content_data else ''

    if not published:
        return {}

    return {
        'timestamp_day': day_published(published),
        'timestamp_hour': hour_published(published),
    }