def get_phr_content(q):
    """Return the first quoted extract of a query string.

    :param q: Query string possibly containing quoted portions.
    :return: The first portion enclosed in quotes, or ``''`` when the
        query contains no quoted extract.
    """
    extracts, non_extracts = helpers.get_extracts(q, 0)
    try:
        # Only an empty extracts list can fail here; catch exactly that
        # case instead of a bare except that would also hide real bugs
        # (e.g. NameError, KeyboardInterrupt).
        return extracts[0]
    except IndexError:
        return ''
def create_law_links(self):
    """Create cross-reference links between the already-loaded laws.

    Walks every paragraph of every law in ``self.laws``, finds mentions
    of legal entities, classifies each mention as modifying
    ('τροποποιητικός'), referential ('αναφορικός') or generic
    ('γενικός'), accumulates them in ``self.links`` and finally
    persists each link to ``self.db.links`` (best effort).
    """
    for identifier, law in self.laws.items():
        articles = law.sentences.keys()
        # Apply pending removals first so links are built on the
        # current text of each law.
        self.detect_and_apply_removals(identifier=identifier, generate_links=True)
        for article in articles:
            for paragraph in law.get_paragraphs(article):
                try:
                    # Split paragraph into quoted extracts and the
                    # surrounding (non-extract) text.
                    extracts, non_extracts = helpers.get_extracts(
                        paragraph, 0)
                    for entity in entities.LegalEntities.entities:
                        # If law found in amendment body then it is
                        # modifying
                        for s in non_extracts:
                            neighbors = re.finditer(entity, s)
                            neighbors = set([neighbor.group().lower()
                                             for neighbor in neighbors])
                            tmp = s.split(' ')
                            for u in map(lambda x: x.lower(), neighbors):
                                if u not in self.links:
                                    self.links[u] = Link(u)
                                # A mention counts as "modifying" when any
                                # amendment action word occurs in the same
                                # non-extract sentence.
                                is_modifying = False
                                for action in entities.actions:
                                    for i, w in enumerate(tmp):
                                        if action == w:
                                            is_modifying = True
                                            break
                                    if is_modifying:
                                        break
                                if is_modifying:
                                    self.links[u].add_link(
                                        law.identifier, paragraph,
                                        link_type='τροποποιητικός')
                                else:
                                    self.links[u].add_link(
                                        law.identifier, paragraph,
                                        link_type='αναφορικός')
                        # If enclosed in brackets the link is only
                        # referential
                        for s in extracts:
                            neighbors = re.finditer(entity, s)
                            neighbors = set([neighbor.group().lower()
                                             for neighbor in neighbors])
                            for u in map(lambda x: x.lower(), neighbors):
                                u = u.lower()
                                if u not in self.links:
                                    self.links[u] = Link(u)
                                self.links[u].add_link(
                                    law.identifier, paragraph,
                                    link_type='αναφορικός')
                # except there are Unmatched brackets
                except Exception as e:
                    # NOTE(review): fallback path — when extract splitting
                    # fails (e.g. unmatched brackets) the whole paragraph
                    # is scanned and the link is tagged generic. 'entity'
                    # may be unbound here if the failure occurs before the
                    # entity loop starts — confirm upstream behavior.
                    neighbors = re.finditer(entity, paragraph)
                    neighbors = set([neighbor.group().lower()
                                     for neighbor in neighbors])
                    for u in map(lambda x: x.lower(), neighbors):
                        if u not in self.links:
                            self.links[u] = Link(u)
                        self.links[u].add_link(
                            law.identifier, paragraph,
                            link_type='γενικός')
    # Persist all accumulated links; failures on individual links are
    # deliberately ignored (best-effort save).
    for link in self.links.values():
        try:
            self.db.links.save(link.serialize())
        except:
            pass
# Make the project package importable when running this file as a script.
sys.path.insert(0, '../3gm')
if __name__ == '__main__':
    # Number of paragraphs whose extract splitting failed.
    counter = 0
    # argv[1]: input for the codifier; argv[2]: output file path.
    cod = codifier.LawCodifier(sys.argv[1])
    f = open(sys.argv[2], 'w+')
    cod.codify_new_laws()
    for identifier, law in cod.laws.items():
        for article in law.sentences:
            print(article)
            for s in law.get_paragraphs(article):
                global actions
                global whats
                trees = []
                try:
                    # Split the paragraph into quoted extracts and the
                    # remaining (non-extract) text.
                    extracts, non_extracts = helpers.get_extracts(s)
                except:
                    counter += 1
                    continue
                non_extracts = ' '.join(non_extracts)
                non_extracts = tokenizer.tokenizer.split(non_extracts, delimiter='. ')
                for non_extract in non_extracts:
                    # Strip punctuation from each word before matching
                    # against the amendment action vocabulary.
                    tmp = list(
                        map(lambda s: s.strip(string.punctuation),
                            non_extract.split(' ')))
                    for action in actions:
                        for i, w in enumerate(tmp):
                            if action == w:
                                # NOTE(review): the body of this branch is
                                # truncated in this chunk of the file — the
                                # handling of a matched action word is not
                                # visible here; do not modify without the
                                # full source.
def generate_action_tree_from_string(s, nested=False, max_what_window=20, max_where_window=30, use_regex=False):
    """Main algorithm for amendment detection.

    The approach followed is hybrid.
    The procedure is outlined here:
    https://github.com/eellak/gsoc2018-3gm/wiki/Algorithms-for-analyzing-Government-Gazette-Documents

    :param s: Paragraph text to analyse for amendment actions.
    :param nested: When True, each resulting subtree is nested into a
        hierarchical dictionary via ``ActionTreeGenerator.nest_tree``.
    :param max_what_window: Window-size parameter; not referenced in
        this body (kept for interface compatibility).
    :param max_where_window: Window-size parameter; not referenced in
        this body.
    :param use_regex: Flag; not referenced in this body.
    :return: List of amendment (sub)trees detected in ``s``.
    """
    # results are stored here
    trees = []
    # fix par abbrev
    s = helpers.fix_par_abbrev(s)
    # get extracts and non-extracts using helper functions
    parts = tokenizer.tokenizer.split(s, False, '. ')
    extracts, non_extracts = helpers.get_extracts(s)
    non_extracts = ' '.join(non_extracts)
    non_extracts = tokenizer.tokenizer.split(non_extracts, True, '. ')
    # Index of the next quoted extract to pair with a detected action.
    extract_cnt = 0
    for part_cnt, non_extract in enumerate(non_extracts):
        doc = nlp(non_extract)
        tmp = list(
            map(lambda s: s.strip(string.punctuation), non_extract.split(' ')))
        # Detect amendment action
        for action in entities.actions:
            for i, w in enumerate(doc):
                if action == w.text:
                    tree = collections.defaultdict(dict)
                    tree['root'] = {
                        '_id': i,
                        'action': action.__str__(),
                        'children': []
                    }
                    max_depth = 0
                    logging.info('Found ' + str(action))
                    extract = None
                    # Deletion/repeal-type actions carry no quoted
                    # content, so they do not consume an extract.
                    if str(action) not in [
                            'διαγράφεται', 'παύεται', 'καταργείται'
                    ]:
                        try:
                            extract = extracts[extract_cnt]
                            extract_cnt += 1
                        except IndexError:
                            extract = None
                    # Detect what is amended
                    found_what, tree, is_plural = ActionTreeGenerator.get_nsubj(
                        doc, i, tree)
                    if found_what:
                        k = tree['what']['index']
                        if tree['what']['context'] not in [
                                'φράση', 'φράσεις', 'λέξη', 'λέξεις'
                        ]:
                            tree['what']['number'] = list(
                                helpers.ssconj_doc_iterator(
                                    doc, k, is_plural))
                        else:
                            # Phrase-level amendment: locate the phrase
                            # components inside the original sentence.
                            tree = phrase_fun.detect_phrase_components(
                                parts[part_cnt], tree)
                            tree['what']['context'] = 'φράση'
                        logging.info(tree['what'])
                    else:
                        # Dependency parse gave no subject — fall back to
                        # token-window heuristics.
                        found_what, tree, is_plural = ActionTreeGenerator.get_nsubj_fallback(
                            tmp, tree, i)
                    # get content
                    if action not in [
                            'διαγράφεται', 'διαγράφονται', 'αναριθμείται',
                            'αναριθμούνται'
                    ]:
                        tree, max_depth = ActionTreeGenerator.get_content(
                            tree, extract, s)
                    if action in ['αναριθμείται', 'αναριθμούνται']:
                        # get renumbering
                        tree = ActionTreeGenerator.get_renumbering(
                            tree, doc)
                        subtrees = ActionTreeGenerator.split_renumbering_tree(
                            tree)
                    # split to subtrees
                    if action not in ['αναριθμείται', 'αναριθμούνται']:
                        subtrees = ActionTreeGenerator.split_tree(tree)
                    # iterate over subtrees
                    for subtree in subtrees:
                        subtree, max_depth = ActionTreeGenerator.get_content(
                            subtree, extract, s, secondary=True)
                        # get latest statute
                        try:
                            law = ActionTreeGenerator.detect_latest_statute(
                                non_extract)
                        except BaseException:
                            law = ''
                        # first level are laws
                        subtree['law'] = {
                            '_id': law,
                            'children': ['article']
                        }
                        splitted = non_extract.split(' ')
                        # build levels bottom up
                        subtree = ActionTreeGenerator.build_levels(
                            splitted, subtree)
                        # nest into dictionary
                        if nested:
                            ActionTreeGenerator.nest_tree('root', subtree)
                        trees.append(subtree)
    return trees