def evaluate(self, id=None):
    if not h.auth.is_logged_in():
        abort(401)

    c.idea = h.fetch_obj(Idea, id, new_id=True)
    node_q = Session.query(Node).filter_by(concept_id=id)
    c.node = node_q.first()

    if request.environ.get('REMOTE_USER', False):
        user = h.get_user(request.environ['REMOTE_USER'])

        sq = Session.query(IdeaEvaluation.cons_id)
        sq = sq.filter(IdeaEvaluation.ante == c.idea)
        sq = sq.filter(IdeaEvaluation.uid == user.ID)
        sq = sq.subquery()

        to_evaluate = c.idea.related.outerjoin((sq, Idea.ID == sq.c.cons_id))
        to_evaluate = to_evaluate.filter(sq.c.cons_id == None)
    else:
        to_evaluate = c.idea.related

    c.paginator = paginate.Page(
        to_evaluate,
        page=int(request.params.get('page', 1)),
        items_per_page=10,
        controller='idea',
        action='edit',
        id=id
    )

    response.headers['Access-Control-Allow-Origin'] = '*'

    return render('idea/idea-edit.html')
def list(self, filetype="html"): c.nodes = Session.query(Node).all() entity_q = Session.query(Node) entity_q = entity_q.limit(request.params.get("limit", None)) c.query = request.params.get("q", "") c.sep = request.params.get("sep", "") if request.params.get("sep_filter", False): entity_q = entity_q.filter(Entity.sep_dir != "") if c.sep: entity_q = entity_q.filter(Entity.sep_dir == c.sep) if c.query: o = or_(Entity.label.like(c.query + "%"), Entity.label.like("% " + c.query + "%")) entity_q = entity_q.filter(o).order_by(func.length(Entity.label)) if filetype == "json": response.content_type = "application/json" response.headers["Access-Control-Allow-Origin"] = "*" c.entities = entity_q.all() if request.params.get("redirect", False) and len(c.entities) == 1: h.redirect( h.url(controller=self._controller, action="view", filetype=filetype, id=c.entities[0].ID), code=302 ) else: return render("{type}/{type}-list.".format(type=self._controller) + filetype)
def list(self, filetype='html'):
    entity_q = Session.query(self._type)
    # TODO: Remove the following line when Nodes are eliminated
    entity_q = entity_q.filter(Entity.typeID != 2)

    c.missing_entity = 0

    # get the list of entities
    #c.entities = entity_q.all()

    c.nodes = Session.query(Node).filter(Node.parent_id == None)
    c.nodes = c.nodes.order_by("name").all()

    c.query = request.params.get('q', '')
    c.query = c.query.strip()
    c.sep = request.params.get('sep', '')
    c.wiki = request.params.get('wiki', '')

    if request.params.get('sep_filter', False):
        entity_q = entity_q.filter(Entity.sep_dir != '')

    if c.sep:
        entity_q = entity_q.filter(Entity.sep_dir == c.sep)

    if c.wiki:
        entity_q = entity_q.filter(Entity.wiki == c.wiki)

    if c.query:
        o = or_(Entity.label.like(c.query + '%'),
                Entity.label.like('% ' + c.query + '%'),
                Entity.label.like('%-' + c.query + '%'))
        entity_q = entity_q.filter(o).order_by(func.length(Entity.label))

    c.total = entity_q.count()

    # limit must be the last thing applied to the query
    entity_q = entity_q.limit(request.params.get('limit', None))

    c.entities = entity_q.all()

    if filetype == 'json':
        response.content_type = 'application/json'

    if request.params.get('redirect', False) and len(c.entities) == 1:
        h.redirect(h.url(controller=self._controller, action='view',
                         filetype=filetype, id=c.entities[0].ID),
                   code=302)
    else:
        # if there are no results, show the related SEP results
        if not c.entities:
            c.entities = self.missing_entity_search(c.query)
            if c.entities:
                c.missing_entity = 1

        #raise Exception
        # render the page
        return render('{type}/{type}-list.'.format(type=self._controller) + filetype)
def list(self, filetype="html"): entity_q = Session.query(self._type) # TODO: Remove the following line when Nodes are eliminated entity_q = entity_q.filter(Entity.typeID != 2) c.missing_entity = 0 # get the list of entities # c.entities = entity_q.all() c.nodes = Session.query(Node).filter(Node.parent_id == None) c.nodes = c.nodes.order_by("name").all() c.query = request.params.get("q", "") c.query = c.query.strip() c.sep = request.params.get("sep", "") c.wiki = request.params.get("wiki", "") if request.params.get("sep_filter", False): entity_q = entity_q.filter(Entity.sep_dir != "") if c.sep: entity_q = entity_q.filter(Entity.sep_dir == c.sep) if c.wiki: entity_q = entity_q.filter(Entity.wiki == c.wiki) if c.query: o = or_( Entity.label.like(c.query + "%"), Entity.label.like("% " + c.query + "%"), Entity.label.like("%-" + c.query + "%"), ) entity_q = entity_q.filter(o).order_by(func.length(Entity.label)) c.total = entity_q.count() # limit must be the last thing applied to the query entity_q = entity_q.limit(request.params.get("limit", None)) c.entities = entity_q.all() if filetype == "json": response.content_type = "application/json" if request.params.get("redirect", False) and len(c.entities) == 1: h.redirect( h.url(controller=self._controller, action="view", filetype=filetype, id=c.entities[0].ID), code=302 ) else: # if there are no results, show the related SEP results if not c.entities: c.entities = self.missing_entity_search(c.query) if c.entities: c.missing_entity = 1 # raise Exception # render the page return render("{type}/{type}-list.".format(type=self._controller) + filetype)
def graph_all(self, filetype='html', limit=False):
    sep_filter = request.params.get('sep_filter', False)
    c.sep_filter = sep_filter

    idea_q = Session.query(Idea)
    c.ideas = idea_q.all()

    edge_q = Session.query(IdeaGraphEdge).order_by(
        IdeaGraphEdge.jweight.desc()).limit(3 * len(c.ideas))
    c.edges = edge_q.all()

    return render('idea/graph_all.' + filetype)
def _inpho_token_generator(document):
    if PUNC_TABLE.get(ord('-')):
        del PUNC_TABLE[ord('-')]
    PUNC_TABLE[ord('\n')] = ord(' ')

    rest = document.lower()
    rest = rehyph(rest)
    rest = strip_punc_word(rest)

    query = Session.query(Searchpattern)

    MIN_LEN = 6
    short_patterns = Session.query(Searchpattern.searchpattern)
    short_patterns = short_patterns.filter(
        func.length(Searchpattern.searchpattern) < MIN_LEN)
    short_patterns = short_patterns.distinct().all()
    short_patterns = set(w[0] for w in short_patterns)

    while rest:
        if u' ' not in rest:
            yield rest
            return

        first, rest = rest.split(u' ', 1)
        rest = rest.strip()

        # always yield the raw string
        yield first

        # check if we can simply skip the short patterns
        if len(first) < MIN_LEN and first not in short_patterns:
            continue

        # search the database for keywords
        patterns = query.filter(Searchpattern.searchpattern.like(first + u' %')).all()

        exact_match = query.filter(Searchpattern.searchpattern == first).first()
        if exact_match is not None:
            patterns.append(exact_match)

        for p in patterns:
            # check whether multi-word patterns beginning with this word match
            # the rest of the phrase
            if u' ' in p.searchpattern:
                first_pattern_word, longpattern = p.searchpattern.split(u' ', 1)
                if first == first_pattern_word and (
                        rest == longpattern or rest.startswith(longpattern + u' ')):
                    yield u"inpho:{}".format(p.entity.ID)
            elif first == p.searchpattern:
                yield u"inpho:{}".format(p.entity.ID)
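# Usage sketch (not part of the original module): collect the token stream for a
# short passage. It assumes the SQLAlchemy Session is already configured and that
# Searchpattern rows exist; the helper name and sample text are illustrative only.
def _demo_inpho_tokens():
    tokens = list(_inpho_token_generator(u"the problem of free will"))
    # tokens holds every lower-cased word in order, plus u"inpho:<ID>" markers
    # for any search patterns matched against the database
    return tokens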
def _list_property(self, property, id, filetype='html', limit=False,
                   sep_filter=False, type='idea'):
    c.idea = h.fetch_obj(Idea, id)

    limit = int(request.params.get('limit', limit))
    start = int(request.params.get('start', 0))
    sep_filter = request.params.get('sep_filter', sep_filter)

    property = getattr(c.idea, property)

    if sep_filter:
        property = property.filter(Entity.sep_dir != '')

    # TODO: Fix hacky workaround for the AppenderQuery vs. Relationship
    # property issue - upgrading SQLAlchemy may fix this by allowing us to
    # use len() in a smart way.
    try:
        c.total = property.count()
    except TypeError:
        c.total = len(property)

    if limit:
        property = property[start:start + limit]

    c.entities = property
    c.nodes = Session.query(Node).filter(Node.parent_id == None).order_by("name").all()

    return render('%s/%s-list.%s' % (type, type, filetype))
def process_article(article, terms=None, entity_type=Idea,
                    output_filename=None, corpus_root='corpus/'):
    """
    Processes a single article for apriori input.
    """
    if terms is None:
        terms = select_terms(entity_type)

    lines = []

    filename = article_path(article)
    article_terms = Session.query(entity_type)
    article_terms = article_terms.filter(entity_type.sep_dir == article)
    article_terms = article_terms.all()

    if filename and os.path.isfile(filename):
        logging.info("processing: %s %s" % (article, filename))
        doc = extract_article_body(filename)
        lines = dm.occurrences(doc, terms, title=article,
                               remove_overlap=False,
                               format_for_file=True,
                               output_filename=output_filename)
    else:
        logging.warning("BAD SEP_DIR: %s" % article)

    return lines
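# Usage sketch (illustrative only): run a single SEP article through the miner.
# 'frege' stands in for any valid Entity.sep_dir; the returned occurrence lines
# are ready to be written to an apriori input file.
def _demo_process_article():
    lines = process_article('frege', entity_type=Idea)
    return lines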
def _get_evaluation(self, id, id2, uid=None, username=None, autoCreate=True):
    idea1 = h.fetch_obj(Idea, id, new_id=True)
    idea2 = h.fetch_obj(Idea, id2, new_id=True)

    # Get user information
    if uid:
        uid = h.fetch_obj(User, uid).ID
    elif username:
        user = h.get_user(username)
        uid = user.ID if user else abort(403)
    else:
        uid = h.get_user(request.environ['REMOTE_USER']).ID

    evaluation_q = Session.query(IdeaEvaluation)
    evaluation = evaluation_q.filter_by(ante_id=id, cons_id=id2, uid=uid).first()

    # if an evaluation does not yet exist, create one
    if autoCreate and not evaluation:
        evaluation = IdeaEvaluation(id, id2, uid)
        Session.add(evaluation)

    return evaluation
def update_partial_graph(entity_type, occurrences):
    """
    Takes an entity type and a SQL filename and only updates part of the
    graph. For use with single article statistical information.
    """
    raise NotImplementedError

    # Import SQL statements
    if entity_type == Idea:
        table = "idea_graph_edges"
        type = IdeaGraphEdge
    elif entity_type == Thinker:
        table = "thinker_graph_edges"
        type = ThinkerGraphEdge
    else:
        table = "idea_thinker_graph_edges"
        type = IdeaThinkerGraphEdge

    edges = Session.query(type)
    # filter edges query to only the key term

    for ante, occurs in occurrences.iteritems():
        for cons, occurs_in in occurs.iteritems():
            # select the proper edge from result set
            # if edge does not exist, create it and add to session
            # update edge
            edge.occurs_in = occurs_in

    # commit changes
    Session.commit()
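# A minimal sketch of how the unfinished loop above might look once an edge lookup
# is in place; it is not the project's implementation. The keyword-argument edge
# constructor and the (ante_id, cons_id) indexing are assumptions made for
# illustration only.
def _update_edges_sketch(EdgeClass, occurrences):
    # index existing edges by (ante_id, cons_id) so each pair is looked up once
    existing = dict(((e.ante_id, e.cons_id), e)
                    for e in Session.query(EdgeClass))

    for ante, occurs in occurrences.iteritems():
        for cons, occurs_in in occurs.iteritems():
            edge = existing.get((ante, cons))
            if edge is None:
                # edge does not exist yet: create it and add it to the session
                edge = EdgeClass(ante_id=ante, cons_id=cons)
                Session.add(edge)
                existing[(ante, cons)] = edge
            # update the edge with the new occurrence count
            edge.occurs_in = occurs_in

    # commit changes
    Session.commit()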
def data_integrity(self, filetype="html", redirect=False): if not h.auth.is_logged_in(): abort(401) if not h.auth.is_admin(): abort(403) idea_q = Session.query(Idea) c.ideas = list(idea_q) # Missing searchstring c.missing_string = [ idea for idea in c.ideas if not getattr(idea, 'searchstring') ] # Missing searchpattern c.missing_pattern = [ idea for idea in c.ideas if not getattr(idea, 'searchpattern') ] # Missing sep_dir c.missing_sep_dir = [ idea for idea in c.ideas if not getattr(idea, 'sep_dir') ] # Duplicates c.duplicate = [] c.sorted_ideas = sorted(c.ideas, key=lambda idea: idea.label) for i in range(len(c.sorted_ideas) - 1): if c.sorted_ideas[i].label == c.sorted_ideas[i + 1].label: c.duplicate.append(c.sorted_ideas[i]) c.duplicate.append(c.sorted_ideas[i + 1]) return render('idea/data_integrity.%s' % filetype)
def new_entries():
    """
    Returns a list of all entries which do not have a corresponding InPhO
    Entity.
    """
    # get list of all entries in database
    sep_dirs = Session.query(Entity.sep_dir).filter(Entity.sep_dir != '').all()
    sep_dirs = [row[0] for row in sep_dirs]

    # get list of all entries in the SEP database
    entries = os.path.join(config.get('corpus', 'db_path'), 'entries.txt')

    # build list of new entries
    new_sep_dirs = []
    with open(entries) as f:
        for line in f:
            sep_dir = line.split('::', 1)[0]
            try:
                if sep_dir not in sep_dirs and copy_edit(sep_dir):
                    # published entry not in database, add to list of entries
                    new_sep_dirs.append(sep_dir)
            except IOError:
                # skip IOErrors, as these indicate potential entries w/o logs
                continue

    # remove the sample entry
    try:
        new_sep_dirs.remove('sample')
    except ValueError:
        pass

    return new_sep_dirs
def process_articles(entity_type=Entity, output_filename='output-all.txt',
                     corpus_root='corpus/'):
    terms = select_terms(entity_type)

    Session.expunge_all()
    Session.close()

    articles = Session.query(Entity.sep_dir).filter(Entity.sep_dir != None)
    articles = articles.filter(Entity.sep_dir != '')
    articles = articles.distinct().all()
    articles = [a[0] for a in articles]

    # parallel processing of articles
    p = Pool()
    args = [(title, terms, entity_type, None, corpus_root)
            for title in articles]
    doc_lines = p.map(process_wrapper, args)
    p.close()

    # serial processing for tests
    '''
    doc_lines = []
    for title in articles:
        lines = process_article(title, terms, entity_type, None, corpus_root)
        doc_lines.append(lines)
    '''

    # write graph output to file
    print output_filename
    with open(output_filename, 'w') as f:
        for lines in doc_lines:
            f.writelines(lines)
def missing_entity_search(self, query):
    query = quote_plus(query)
    url = 'http://plato.stanford.edu/cgi-bin/search/xmlSearcher.py?query=' + query

    results = multi_get([url])[0][1]
    json = None
    values_dict = []

    if results:
        tree = ET.ElementTree(ET.fromstring(results))
        root = tree.getroot()
        json = []
        for element in root.getiterator('{http://a9.com/-/spec/opensearch/1.1/}Item'):
            dict = {}
            for iter in element.getiterator('{http://a9.com/-/spec/opensearch/1.1/}Location'):
                dict['Location'] = iter.text
            json.append(dict)

        for j in range(len(json)):
            for key, value in json[j].iteritems():
                values_dict.append(value)

    entities = Session.query(Entity).filter(Entity.sep_dir.in_(values_dict)).all()
    entities.sort(key=lambda entity: values_dict.index(entity.sep_dir))

    #raise Exception
    return entities
def select_terms(entity_type=Idea):
    # process entities
    ideas = Session.query(entity_type)
    ideas = ideas.options(subqueryload('_spatterns'))
    # do not process Nodes or Journals
    ideas = ideas.filter(and_(Entity.typeID != 2, Entity.typeID != 4))
    return ideas.all()
def _delete_evaluation(self, evaltype, id, id2, uid=None, username=None):
    id2 = request.params.get('id2', id2)
    uid = request.params.get('uid', uid)
    username = request.params.get('username', username)

    # look for a specific user's feedback
    evaluation = self._get_evaluation(evaltype, id, id2, uid, username,
                                      autoCreate=False)

    # if that feedback does not exist, unleash the nuclear option and delete
    # ALL evaluation facts for this relation, wiping it from the database.
    if h.auth.is_admin() and not evaluation:
        eval_q = Session.query(evaltype)
        eval_q = eval_q.filter_by(ante_id=id, cons_id=id2)
        evals = eval_q.all()

        # wipe them out. all of them.
        for evaluation in evals:
            h.delete_obj(evaluation)

        # return ok, with how many were deleted
        response.status_int = 200
        return "OK %d" % len(evals)
    elif not evaluation:
        # simply return an error (not evaluated), if not admin
        abort(404)

    current_uid = h.get_user(request.environ['REMOTE_USER']).ID
    if evaluation.uid != current_uid and not h.auth.is_admin():
        abort(401)

    h.delete_obj(evaluation)

    response.status_int = 200
    return "OK"
def get_subgraph(ids, thresh=None):
    edge_q = Session.query(IdeaGraphEdge)
    edge_q = edge_q.order_by(IdeaGraphEdge.jweight.desc())
    edge_q = edge_q.filter(IdeaGraphEdge.cons_id.in_(ids))
    edge_q = edge_q.filter(IdeaGraphEdge.ante_id.in_(ids))
    if thresh:
        edge_q = edge_q.filter(IdeaGraphEdge.jweight > thresh)

    return edge_q.all()
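# Usage sketch (illustrative only): fetch the edges among a handful of idea IDs,
# keeping only reasonably strong links, and group them by antecedent. The IDs and
# threshold are arbitrary example values.
def _demo_subgraph():
    from collections import defaultdict

    edges = get_subgraph([1, 2, 3, 4], thresh=0.5)

    # group the surviving edges by antecedent for easy traversal
    neighbors = defaultdict(list)
    for edge in edges:
        neighbors[edge.ante_id].append(edge.cons_id)
    return neighbors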
def get_user(login):
    """
    Returns the User object from the model.

    :rtype: :class:`inpho.model.User`
    """
    user = Session.query(User).filter(or_(User.email == login,
                                          User.username == login.lower())).first()
    return user
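# Usage sketch (illustrative only): look a user up by either email address or
# username; get_user returns None when no account matches. The address is a
# made-up example value.
def _demo_get_user():
    user = get_user('editor@example.org')
    return user is not None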
def make_list():
    idea = Session.query(Idea).get(646)

    headings = ['Related', 'Instances', 'Hyponyms']
    termslist = zip(idea.related[:10], idea.instances[:10], idea.hyponyms[:10])

    template = Template(filename='lists.mako.html')
    print template.render(termslist=termslist, headings=headings)
def complete_mining(entity_type=Idea, filename='graph.txt', root='./',
                    corpus_root='corpus/', update_entropy=False,
                    update_occurrences=False, update_db=False):
    occur_filename = os.path.abspath(root + "occurrences.txt")
    graph_filename = os.path.abspath(root + "graph-" + filename)
    edge_filename = os.path.abspath(root + "edge-" + filename)
    sql_filename = os.path.abspath(root + "sql-" + filename)

    doc_terms = doc_terms_list()

    if update_occurrences:
        print "processing articles..."
        process_articles(entity_type, occur_filename, corpus_root=corpus_root)

    print "filtering occurrences..."
    filter_apriori_input(occur_filename, graph_filename, entity_type, doc_terms)

    print "running apriori miner..."
    dm.apriori(graph_filename, edge_filename)

    print "processing edges..."
    edges = dm.process_edges(graph_filename, edge_filename,
                             occur_filename, doc_terms)
    ents = dm.calculate_node_entropy(edges)
    edges = dm.calculate_edge_weight(edges, ents)

    print "creating sql files..."
    with open(sql_filename, 'w') as f:
        for edge, props in edges.iteritems():
            ante, cons = edge
            row = "%s::%s" % edge
            row += ("::%(confidence)s::%(jweight)s::%(weight)s"
                    "::%(occurs_in)s\n" % props)
            f.write(row)

    if update_entropy:
        print "updating term entropy..."
        for term_id, entropy in ents.iteritems():
            term = Session.query(Idea).get(term_id)
            if term:
                term.entropy = entropy

        Session.flush()
        Session.commit()
        Session.close()

    if update_db:
        print "updating the database..."
        update_graph(entity_type, sql_filename)
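# Usage sketch (illustrative only): a full mining pass over Idea terms, rebuilding
# occurrence data, refreshing entropy values, and writing the edge table back to
# the database. The root and corpus_root paths are placeholders.
def _demo_complete_mining():
    complete_mining(entity_type=Idea,
                    filename='graph.txt',
                    root='./mining/',
                    corpus_root='corpus/',
                    update_entropy=True,
                    update_occurrences=True,
                    update_db=True)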
def data_integrity(self, filetype='html', redirect=False):
    if not h.auth.is_logged_in():
        abort(401)
    if not h.auth.is_admin():
        abort(403)

    journal_q = Session.query(Journal)

    # check for query
    if request.params.get('q'):
        journal_q = journal_q.filter(Journal.name.like(u'%' + request.params['q'] + '%'))

    # get the list of journals
    c.journals = list(journal_q)

    c.missing_issn = []
    c.bad_issn = []
    for journal in c.journals:
        # Missing ISSN
        if not getattr(journal, 'ISSN') or journal.ISSN == '':
            c.missing_issn.append(journal)
        # Journal has bad ISSN format (xxxx-xxxx is good format)
        elif not re.match(r'[0-9]{4}-[0-9]{3}[0-9X]', journal.ISSN):
            c.bad_issn.append(journal)

    # Duplicates
    # It is set up for pairs. If there are more than two copies of the same
    # journal it will report multiples.
    c.duplicate = []
    c.sorted_journals = sorted(c.journals, key=lambda journal: journal.label)
    for i in range(len(c.sorted_journals) - 1):
        if c.sorted_journals[i].label == c.sorted_journals[i + 1].label:
            c.duplicate.append(c.sorted_journals[i])
            c.duplicate.append(c.sorted_journals[i + 1])

    # re-get the list of journals (only ones accessed in last 4 weeks)
    # Magic constant of 2419200 corresponds to 4 weeks in seconds
    c.journals = list(journal_q.filter(Journal.last_accessed < (time.time() - 2419200)))

    # filter out results into different chunks
    # Valid URL, not found
    c.broken = [journal for journal in c.journals if journal.URL]
    # Journal is active, no URL set
    c.missing = [journal for journal in c.journals
                 if journal.URL is None and journal.active]
    # Journal is active, URL is set to blank
    c.blank = [journal for journal in c.journals
               if journal.URL == '' and journal.active]
    # Journal is inactive and missing URL
    c.inactive = [journal for journal in c.journals
                  if journal.URL is None and not journal.active]

    return render('journal/data_integrity.' + filetype)
def get_user(login):
    """
    Returns the User object from the model.

    :rtype: :class:`inpho.model.User`
    """
    if isinstance(login, str) or isinstance(login, unicode):
        user = Session.query(User).filter(or_(User.email == login,
                                              User.username == login.lower())).first()
        return user
    else:
        raise Exception(login)
def review(self):
    if not request.environ.get('REMOTE_USER', False):
        abort(401)

    c.user = h.get_user(request.environ['REMOTE_USER'])

    ieq = Session.query(IdeaEvaluation).order_by(IdeaEvaluation.time.desc())
    c.evaluations = ieq.filter(and_(IdeaEvaluation.uid == c.user.ID,
                                    or_(IdeaEvaluation.generality > -1,
                                        IdeaEvaluation.relatedness > -1))).all()

    return render('account/review.html')
def doc_terms_list():
    articles = Session.query(Entity)
    articles = articles.filter(Entity.sep_dir != None)
    articles = articles.filter(Entity.sep_dir != '')
    articles = articles.all()

    doc_terms = defaultdict(list)
    for entity in articles:
        doc_terms[entity.sep_dir].append(entity)

    return doc_terms
def _get_anon_evaluation(self, id, id2, ip, autoCreate=True):
    idea1 = h.fetch_obj(Idea, id, new_id=True)
    idea2 = h.fetch_obj(Idea, id2, new_id=True)

    evaluation_q = Session.query(AnonIdeaEvaluation)
    evaluation = evaluation_q.filter_by(ante_id=id, cons_id=id2, ip=ip).first()

    # if an evaluation does not yet exist, create one
    if autoCreate and not evaluation:
        evaluation = AnonIdeaEvaluation(id, id2, ip)
        Session.add(evaluation)

    return evaluation
def related_entries(self, id, filetype="html"): c.entity = h.fetch_obj(Entity, id) related = sep.get_related() related = related[c.entity.sep_dir] c.entities = [] for sep_dir in related: entity = Session.query(Entity).filter(Entity.sep_dir == sep_dir).first() if entity is not None: c.entities.append(entity) return render("entity/entity-list.%s" % (filetype))
def evaluation(self, id, id2):
    c.entity = h.fetch_obj(Idea, id)
    c.entity2 = h.fetch_obj(Entity, id2)
    if isinstance(c.entity2, Node):
        c.entity2 = c.entity2.idea
        id2 = c.entity2.ID
    if not isinstance(c.entity2, Idea):
        # no evaluation implemented
        response.status_int = 501
        return ''

    c.edit = True
    c.alert = request.params.get('alert', True)

    # retrieve evaluation for pair
    c.generality = int(request.params.get('generality', -1))
    c.relatedness = int(request.params.get('relatedness', -1))

    # retrieve user information
    identity = request.environ.get('repoze.who.identity')
    c.uid = None if not identity else identity['user'].ID

    # TODO: Place cookie auth here
    try:
        cookie = request.params.get('cookieAuth', 'null')
        username = h.auth.get_username_from_cookie(cookie) or ''
        user = h.get_user(username)
        if user is not None:
            c.uid = user.ID
    except ValueError:
        # invalid IP, abort
        abort(403)

    # use the user's evaluation if present, otherwise a null eval
    if c.uid and (c.generality == -1 or c.relatedness == -1):
        eval_q = Session.query(IdeaEvaluation.generality,
                               IdeaEvaluation.relatedness)
        eval_q = eval_q.filter_by(uid=c.uid, ante_id=id, cons_id=id2)
        c.generality, c.relatedness = eval_q.first() or \
            (int(request.params.get('generality', -1)),
             int(request.params.get('relatedness', -1)))

    if c.relatedness != -1:
        c.edit = request.params.get('edit', False)

    return render('idea/eval.html')
def from_dlv(filename, load_obj=False):
    """
    Function to build a taxonomy from the specified DLV output file.
    """
    # build regex for instance and link search
    regex_class = re.compile("class\(i(\d+)\)")
    regex_ins = re.compile("[ins|isa]\(i(\d+),i(\d+)\)")
    regex_links = re.compile("link\(i(\d+),i(\d+)\)")

    # process DLV output file
    with open(filename) as f:
        dlv = f.read()
        classes = frozenset(regex_class.findall(dlv))
        instances = frozenset(regex_ins.findall(dlv))
        links = frozenset(regex_links.findall(dlv))

    # set up taxonomy structure
    nodes = defaultdict(Node)
    root = Node("Philosophy", spine=True)

    # populate instances
    for child, parent in instances:
        nodes[parent].graft(nodes[child])

    # populate links
    for target, source in links:
        nodes[source].links.add(nodes[target])

    # glue taxonomies together, initialize values
    for key, node in nodes.iteritems():
        # load the database objects
        if load_obj:
            node.value = Session.query(Entity).get(key)
        else:
            node.value = key

        # specify hand-built portion of the taxonomy
        if node.value in classes:
            node.spine = True

        # if this is a root, glue it to the Philosophy node.
        if node.parent is None:
            root.graft(node)

    return root
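# Usage sketch (illustrative only): build the taxonomy from a DLV answer-set dump.
# The filename is a placeholder; with load_obj=False each node's value is the raw
# instance id string matched from the DLV output.
def _demo_taxonomy():
    root = from_dlv('dlv-output.txt', load_obj=False)
    # root.value is "Philosophy"; top-level nodes are grafted onto it above
    return root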
def process_articles(entity_type=Entity, output_filename='output-all.txt',
                     corpus_root='corpus/'):
    terms = select_terms(entity_type)

    Session.expunge_all()
    Session.close()

    # fix search patterns
    for term in terms:
        newpatterns = []
        for pattern in term.searchpatterns:
            if '(' in pattern and ')' in pattern:
                pattern = pattern.replace('( ', '(\\b')
                pattern = pattern.replace(' )', '\\b)')
            else:
                pattern = '\\b%s\\b' % pattern.strip()
            newpatterns.append(pattern)
        term.searchpatterns = newpatterns

    articles = Session.query(Entity.sep_dir).filter(Entity.sep_dir != None)
    articles = articles.filter(Entity.sep_dir != '')
    articles = articles.distinct().all()
    articles = [a[0] for a in articles]

    # parallel processing of articles
    p = Pool()
    args = [(title, terms, entity_type, None, corpus_root)
            for title in articles]
    doc_lines = p.map(process_wrapper, args)
    p.close()

    # serial processing for tests
    '''
    doc_lines = []
    for title in articles:
        lines = process_article(title, terms, entity_type, None, corpus_root)
        doc_lines.append(lines)
    '''

    # write graph output to file
    print output_filename
    with open(output_filename, 'w') as f:
        for lines in doc_lines:
            f.writelines(lines)
def load_check(self, filetype="html", redirect=False): if not h.auth.is_logged_in(): abort(401) if not h.auth.is_admin(): abort(403) entity_q = Session.query(Entity) c.entities = list(entity_q) c.load_error = [] for entity in c.entities: try: urlopen(h.url('https://www.inphoproject.org', getattr(entity, 'url'))) except Exception as e: c.load_error.append(entity) return render('entity/load_check.' + filetype)
def process_articles(entity_type=Entity, output_filename='output-all.txt',
                     corpus_root='corpus/'):
    terms = select_terms(entity_type)

    Session.expunge_all()
    Session.close()

    articles = Session.query(entity_type).filter(entity_type.sep_dir != '').all()

    # parallel processing of articles
    p = Pool()
    args = [(title, terms, entity_type, None, corpus_root)
            for title in articles]
    doc_lines = p.map(process_wrapper, args)
    p.close()

    # write graph output to file
    with open(output_filename, 'w') as f:
        for lines in doc_lines:
            f.writelines(lines)
def fuzzymatch_all(string1):
    """
    Takes a string and returns all potential fuzzymatches from the Entity
    database. Matches are returned as a list of (entity, confidence) tuples.
    """
    # construct Entity query
    entities = Session.query(Entity)
    entities = entities.filter(Entity.typeID != 2)  # exclude nodes
    entities = entities.filter(Entity.typeID != 4)  # exclude journals

    # initialize result object
    matches = []

    # build results
    for entity in entities:
        confidence, distance = fuzzymatch(string1, entity.label)
        if confidence >= 0.5:
            matches.append((entity, confidence))

    return matches
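# Usage sketch (illustrative only): rank the fuzzy matches for a label, best
# first, using the (entity, confidence) tuples returned above. The query string
# is an example value.
def _demo_fuzzymatch():
    matches = fuzzymatch_all('epistemology')
    matches.sort(key=lambda pair: pair[1], reverse=True)
    return [(entity.label, confidence) for entity, confidence in matches]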
def data_integrity(self, filetype="html", redirect=False): if not h.auth.is_logged_in(): abort(401) if not h.auth.is_admin(): abort(403) entity_q = Session.query(Entity) c.entities = list(entity_q) c.missing_sep_dir = [] c.mult_sep_dir = [] for entity in c.entities: if not getattr(entity, 'sep_dir'): c.missing_sep_dir.append(entity) else: for comp_entity in c.entities: if getattr(entity, 'sep_dir') == getattr(comp_entity, 'sep_dir') and entity != comp_entity: c.mult_sep_dir.append(getattr(entity, 'sep_dir')) return render('entity/data_integrity.' + filetype)
def fetch_obj(type, id, error=404, new_id=False):
    """
    Fetches the object with the given id from the collection of type type.
    If the object does not exist, throw an HTTP error (default: 404 Not Found).

    :param type: object type
    :type type: class in :mod:`inpho.model`
    :param id: object id
    :type id: integer or None
    :param error: HTTP error code.
    :rtype: *type*
    """
    if id is None:
        abort(error)

    obj_q = Session.query(type)
    obj = obj_q.get(int(id))
    #else:
    #    obj = obj_q.filter(type.ID==int(id)).first()

    if obj is None:
        abort(error)

    return obj
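# Usage sketch (illustrative only): fetch an Idea by primary key inside a
# controller action, aborting with 404 when the id is missing or unknown. The id
# value 646 echoes the example used in make_list above.
def _demo_fetch():
    idea = fetch_obj(Idea, 646)
    return idea.label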
def data_integrity(self, filetype='html', redirect=False):
    if not h.auth.is_logged_in():
        abort(401)
    if not h.auth.is_admin():
        abort(403)

    school_q = Session.query(SchoolOfThought)
    c.schools = list(school_q)

    # Missing sep_dir
    c.missing_sep_dir = [school for school in c.schools
                         if not getattr(school, "sep_dir")]

    # Duplicates
    c.duplicate = []
    c.sorted_schools = sorted(c.schools, key=lambda school: school.label)
    for i in range(len(c.sorted_schools) - 1):
        if c.sorted_schools[i].label == c.sorted_schools[i + 1].label:
            c.duplicate.append(c.sorted_schools[i])
            c.duplicate.append(c.sorted_schools[i + 1])

    return render('school_of_thought/data_integrity.%s' % filetype)